
Commit c7b6790

Various fixes for CI, save_only_model for RL, prevent packing multiprocessing deadlocks (axolotl-ai-cloud#2661)
* lean mistral ft tests, remove e2e torch 2.4.1 test
* make sure to pass save_only_model for RL
* more tests to make ci leaner, add cleanup to modal ci
* fix module for import in e2e tests
* use mp spawn to prevent deadlocks with packing
* make sure cleanup shell script is executable when cloned out
1 parent 47e0e71 commit c7b6790

File tree

13 files changed (+190, -99 lines)

.github/workflows/tests.yml

Lines changed: 40 additions & 6 deletions
@@ -335,12 +335,6 @@ jobs:
             pytorch: 2.6.0
             num_gpus: 1
             axolotl_extras: llmcompressor
-          - cuda: 124
-            cuda_version: 12.4.1
-            python_version: "3.11"
-            pytorch: 2.4.1
-            num_gpus: 1
-            axolotl_extras:
           - cuda: 124
             cuda_version: 12.4.1
             python_version: "3.11"
@@ -377,3 +371,43 @@ jobs:
       - name: Run tests job on Modal
         run: |
           modal run cicd.e2e_tests
+
+  docker-e2e-cleanup:
+    runs-on: [self-hosted, modal]
+    timeout-minutes: 90
+    needs: [docker-e2e-tests]
+
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - cuda: 124
+            cuda_version: 12.4.1
+            python_version: "3.11"
+            pytorch: 2.6.0
+            num_gpus: 1
+            axolotl_extras: vllm
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+      - name: Install Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.11"
+      - name: Install Modal
+        run: |
+          python -m pip install --upgrade pip
+          pip install modal==0.71.8 jinja2
+      - name: Update env vars
+        run: |
+          echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
+          echo "PYTORCH_VERSION=${{ matrix.pytorch}}" >> $GITHUB_ENV
+          echo "AXOLOTL_ARGS=${{ matrix.axolotl_args}}" >> $GITHUB_ENV
+          echo "AXOLOTL_EXTRAS=${{ matrix.axolotl_extras}}" >> $GITHUB_ENV
+          echo "CUDA=${{ matrix.cuda }}" >> $GITHUB_ENV
+          echo "MODAL_IMAGE_BUILDER_VERSION=2024.10" >> $GITHUB_ENV
+          echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
+          echo "CODECOV_TOKEN=${{ secrets.CODECOV_TOKEN }}" >> $GITHUB_ENV
+      - name: Run tests job on Modal
+        run: |
+          modal run cicd.cleanup

cicd/__init__.py

Whitespace-only changes.

cicd/cicd.sh

Lines changed: 1 addition & 1 deletion
@@ -18,7 +18,7 @@ pytest -v --durations=10 \
   --cov-append
 
 # Run patched tests excluding lora kernels with coverage append
-pytest -v --durations=10 \
+pytest --full-trace -vvv --durations=10 \
   --ignore=tests/e2e/patched/lora_kernels \
   /workspace/axolotl/tests/e2e/patched \
   --cov=axolotl \

cicd/cleanup.py

Lines changed: 19 additions & 0 deletions
@@ -0,0 +1,19 @@
+"""Modal app to run axolotl GPU cleanup"""
+
+from .single_gpu import VOLUME_CONFIG, app, cicd_image, run_cmd
+
+
+@app.function(
+    image=cicd_image,
+    timeout=60 * 60,
+    cpu=8.0,
+    memory=131072,
+    volumes=VOLUME_CONFIG,
+)
+def cleanup():
+    run_cmd("./cicd/cleanup.sh", "/workspace/axolotl")
+
+
+@app.local_entrypoint()
+def main():
+    cleanup.remote()
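
Usage note: the docker-e2e-cleanup job added in .github/workflows/tests.yml above calls `modal run cicd.cleanup`, which hits the local_entrypoint in this file; main() then calls cleanup.remote(), running cicd/cleanup.sh (next file) inside the CI image with the shared HF cache volume mounted.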

cicd/cleanup.sh

Lines changed: 6 additions & 0 deletions
@@ -0,0 +1,6 @@
+#!/bin/bash
+set -e
+
+# cleanup old cache files for datasets processing and intermediate mappings
+find /workspace/data/huggingface-cache/hub/datasets -name "cache-*" -type f -mtime +1 -exec rm {} \;
+find /workspace/data/huggingface-cache/hub/datasets -name "*.lock" -type f -mtime +1 -exec rm {} \;

cicd/e2e_tests.py

Lines changed: 1 addition & 64 deletions
@@ -1,69 +1,6 @@
 """Modal app to run axolotl GPU tests"""
 
-# pylint: disable=duplicate-code
-
-import os
-import pathlib
-import tempfile
-
-import jinja2
-import modal
-from jinja2 import select_autoescape
-from modal import App, Image
-
-cicd_path = pathlib.Path(__file__).parent.resolve()
-
-template_loader = jinja2.FileSystemLoader(searchpath=cicd_path)
-template_env = jinja2.Environment(
-    loader=template_loader, autoescape=select_autoescape()
-)
-df_template = template_env.get_template("Dockerfile.jinja")
-
-df_args = {
-    "AXOLOTL_EXTRAS": os.environ.get("AXOLOTL_EXTRAS", ""),
-    "AXOLOTL_ARGS": os.environ.get("AXOLOTL_ARGS", ""),
-    "PYTORCH_VERSION": os.environ.get("PYTORCH_VERSION", "2.4.1"),
-    "BASE_TAG": os.environ.get("BASE_TAG", "main-base-py3.11-cu121-2.4.1"),
-    "CUDA": os.environ.get("CUDA", "121"),
-    "GITHUB_REF": os.environ.get("GITHUB_REF", "refs/heads/main"),
-    "GITHUB_SHA": os.environ.get("GITHUB_SHA", ""),
-    "NIGHTLY_BUILD": os.environ.get("NIGHTLY_BUILD", ""),
-    "CODECOV_TOKEN": os.environ.get("CODECOV_TOKEN", ""),
-    "HF_HOME": "/workspace/data/huggingface-cache/hub",
-}
-
-dockerfile_contents = df_template.render(**df_args)
-
-temp_dir = tempfile.mkdtemp()
-with open(pathlib.Path(temp_dir) / "Dockerfile", "w", encoding="utf-8") as f:
-    f.write(dockerfile_contents)
-
-cicd_image = Image.from_dockerfile(
-    pathlib.Path(temp_dir) / "Dockerfile",
-    context_mount=None,
-    force_build=True,
-    gpu="A10G",
-).env(df_args)
-
-app = App("Axolotl CI/CD", secrets=[])
-
-hf_cache_volume = modal.Volume.from_name(
-    "axolotl-ci-hf-hub-cache", create_if_missing=True
-)
-VOLUME_CONFIG = {
-    "/workspace/data/huggingface-cache/hub": hf_cache_volume,
-}
-
-N_GPUS = int(os.environ.get("N_GPUS", 1))
-GPU_CONFIG = modal.gpu.L40S(count=N_GPUS)
-
-
-def run_cmd(cmd: str, run_folder: str):
-    import subprocess  # nosec
-
-    # Propagate errors from subprocess.
-    if exit_code := subprocess.call(cmd.split(), cwd=run_folder):  # nosec
-        exit(exit_code)  # pylint: disable=consider-using-sys-exit
+from .single_gpu import GPU_CONFIG, VOLUME_CONFIG, app, cicd_image, run_cmd
 
 
 @app.function(

cicd/single_gpu.py

Lines changed: 66 additions & 0 deletions
@@ -0,0 +1,66 @@
+"""Modal app to run axolotl GPU tests"""
+
+# pylint: disable=duplicate-code
+
+import os
+import pathlib
+import tempfile
+
+import jinja2
+import modal
+from jinja2 import select_autoescape
+from modal import App, Image
+
+cicd_path = pathlib.Path(__file__).parent.resolve()
+
+template_loader = jinja2.FileSystemLoader(searchpath=cicd_path)
+template_env = jinja2.Environment(
+    loader=template_loader, autoescape=select_autoescape()
+)
+df_template = template_env.get_template("Dockerfile.jinja")
+
+df_args = {
+    "AXOLOTL_EXTRAS": os.environ.get("AXOLOTL_EXTRAS", ""),
+    "AXOLOTL_ARGS": os.environ.get("AXOLOTL_ARGS", ""),
+    "PYTORCH_VERSION": os.environ.get("PYTORCH_VERSION", "2.4.1"),
+    "BASE_TAG": os.environ.get("BASE_TAG", "main-base-py3.11-cu121-2.4.1"),
+    "CUDA": os.environ.get("CUDA", "121"),
+    "GITHUB_REF": os.environ.get("GITHUB_REF", "refs/heads/main"),
+    "GITHUB_SHA": os.environ.get("GITHUB_SHA", ""),
+    "NIGHTLY_BUILD": os.environ.get("NIGHTLY_BUILD", ""),
+    "CODECOV_TOKEN": os.environ.get("CODECOV_TOKEN", ""),
+    "HF_HOME": "/workspace/data/huggingface-cache/hub",
+}
+
+dockerfile_contents = df_template.render(**df_args)
+
+temp_dir = tempfile.mkdtemp()
+with open(pathlib.Path(temp_dir) / "Dockerfile", "w", encoding="utf-8") as f:
+    f.write(dockerfile_contents)
+
+cicd_image = Image.from_dockerfile(
+    pathlib.Path(temp_dir) / "Dockerfile",
+    context_mount=None,
+    force_build=True,
+    gpu="A10G",
+).env(df_args)
+
+app = App("Axolotl CI/CD", secrets=[])
+
+hf_cache_volume = modal.Volume.from_name(
+    "axolotl-ci-hf-hub-cache", create_if_missing=True
+)
+VOLUME_CONFIG = {
+    "/workspace/data/huggingface-cache/hub": hf_cache_volume,
+}
+
+N_GPUS = int(os.environ.get("N_GPUS", 1))
+GPU_CONFIG = modal.gpu.L40S(count=N_GPUS)
+
+
+def run_cmd(cmd: str, run_folder: str):
+    import subprocess  # nosec
+
+    # Propagate errors from subprocess.
+    if exit_code := subprocess.call(cmd.split(), cwd=run_folder):  # nosec
+        exit(exit_code)  # pylint: disable=consider-using-sys-exit

src/axolotl/core/trainer_builder.py

Lines changed: 2 additions & 0 deletions
@@ -1057,6 +1057,8 @@ def build_training_arguments(self, total_num_steps):
             # default to saving each epoch if not defined
             training_args_kwargs["save_strategy"] = "epoch"
 
+        training_args_kwargs["save_only_model"] = self.cfg.save_only_model
+
         if self.cfg.dataset_processes:
             training_args_kwargs["dataset_num_proc"] = self.cfg.dataset_processes
 
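
For context, a minimal sketch (not from this commit) of what forwarding save_only_model means at the transformers TrainingArguments level: when enabled, checkpoints contain only model weights rather than optimizer/scheduler/RNG state, so RL checkpoints stay small but cannot be resumed from. The output_dir value below is hypothetical.

# Minimal sketch, assuming transformers' TrainingArguments (which exposes
# save_only_model); values are illustrative, not from this commit.
from transformers import TrainingArguments

args = TrainingArguments(
    output_dir="./outputs",   # hypothetical path
    save_strategy="epoch",    # matches the default set just above
    save_only_model=True,     # what cfg.save_only_model toggles
)
print(args.save_only_model)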

src/axolotl/utils/samplers/multipack.py

Lines changed: 31 additions & 4 deletions
@@ -6,7 +6,7 @@
 import logging
 import math
 from concurrent.futures import ProcessPoolExecutor
-from multiprocessing import cpu_count
+from multiprocessing import cpu_count, get_context
 from typing import Iterable, Union
 
 import numba
@@ -126,6 +126,7 @@ def pack_parallel(
     bin_size: int,
     num_processes: int | None = None,
     safe_mode: bool = True,
+    mp_start_method: str | None = "spawn",
 ):
     """
     Pack sequences into bins using parallel processing
@@ -137,7 +138,9 @@ def pack_parallel(
         bin_size: Maximum number of bins to use
         num_processes: Number of parallel processes to use
         safe_mode: If True, use a more conservative packing approach
-
+        mp_start_method: Multiprocessing start method ('fork', 'spawn', 'forkserver').
+            'spawn' is often safer with Numba/PyTorch.
+            Set to None to use system default.
     Returns:
         List of bins, where each bin contains indices of sequences assigned to it
     """
@@ -154,9 +157,33 @@ def pack_parallel(
 
     # Process groups in parallel
     all_bins = []
-    with ProcessPoolExecutor(max_workers=num_processes) as executor:
-        for group_bins in executor.map(_process_group, tasks):
+
+    mp_ctx = None
+    if mp_start_method:
+        try:
+            mp_ctx = get_context(mp_start_method)
+        except ValueError:
+            LOG.warning(
+                f"Failed to get multiprocessing context '{mp_start_method}'. "
+                f"Falling back to default. Available: {get_context().get_all_start_methods()}"
+            )
+            mp_ctx = (
+                None  # Fallback to default context if specified one is not available
+            )
+
+    if num_processes == 1:
+        LOG.debug("Using single process for pack_parallel, running sequentially.")
+        for task_args in tasks:
+            group_bins = _process_group(task_args)
             all_bins.extend(group_bins)
+    else:
+        # Use ProcessPoolExecutor only if num_processes > 1
+        # Pass mp_context if available
+        with ProcessPoolExecutor(
+            max_workers=num_processes, mp_context=mp_ctx
+        ) as executor:
+            for group_bins in executor.map(_process_group, tasks):
+                all_bins.extend(group_bins)
 
     return all_bins
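
For illustration, a minimal standalone sketch of the pattern adopted above: creating the process pool from a "spawn" context instead of relying on the default "fork", which is the change that addresses the packing deadlocks mentioned in the commit message. The _square worker is a hypothetical stand-in for _process_group.

# Minimal sketch (not axolotl code): ProcessPoolExecutor with an explicit
# "spawn" start method via mp_context, mirroring pack_parallel above.
from concurrent.futures import ProcessPoolExecutor
from multiprocessing import get_context


def _square(x: int) -> int:
    # hypothetical stand-in for the per-group packing worker
    return x * x


if __name__ == "__main__":
    # "spawn" starts fresh interpreters, so workers re-import this module;
    # that is why the pool must be created under the __main__ guard.
    ctx = get_context("spawn")
    with ProcessPoolExecutor(max_workers=2, mp_context=ctx) as executor:
        results = list(executor.map(_square, range(8)))
    print(results)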

tests/e2e/patched/test_4d_multipack_llama.py

Lines changed: 6 additions & 6 deletions
@@ -57,9 +57,9 @@ def test_sdp_lora_packing(self, temp_dir):
                 "learning_rate": 0.00001,
                 "optimizer": "adamw_torch_fused",
                 "lr_scheduler": "cosine",
-                "max_steps": 20,
-                "save_steps": 10,
-                "eval_steps": 10,
+                "max_steps": 5,
+                "save_steps": 3,
+                "eval_steps": 4,
                 "fp16": True,
             }
         )
@@ -105,9 +105,9 @@ def test_torch_lora_packing(self, temp_dir):
                 "learning_rate": 0.00001,
                 "optimizer": "adamw_torch_fused",
                 "lr_scheduler": "cosine",
-                "max_steps": 20,
-                "save_steps": 10,
-                "eval_steps": 10,
+                "max_steps": 5,
+                "save_steps": 3,
+                "eval_steps": 4,
                 "fp16": True,
             }
         )
