
Commit d274f6c

make the project run on CPU
1 parent 27469df commit d274f6c

9 files changed (+416 / -49 lines)

gamesense/README.md

Lines changed: 41 additions & 0 deletions
@@ -206,3 +206,44 @@ For custom data sources, you'll need to prepare the splits in a Hugging Face dat
 ## 📚 Documentation
 
 For learning more about how to use ZenML to build your own MLOps pipelines, refer to our comprehensive [ZenML documentation](https://docs.zenml.io/).
+
+## Running on CPU-only Environment
+
+If you don't have access to a GPU, you can still run this project with the CPU-only configuration. We've made several optimizations to make this project work on CPU, including:
+
+- Smaller batch sizes for reduced memory footprint
+- Fewer training steps
+- Disabled GPU-specific features (quantization, bf16, etc.)
+- Using smaller test datasets for evaluation
+- Special handling for Phi-3.5 model caching issues on CPU
+
+To run the project on CPU:
+
+```bash
+python run.py --config phi3.5_finetune_cpu.yaml
+```
+
+Note that training on CPU will be significantly slower than training on a GPU. The CPU configuration uses:
+
+1. A smaller model (Phi-3.5-mini-instruct) which is more CPU-friendly
+2. Reduced batch size and increased gradient accumulation steps
+3. Fewer total training steps (50 instead of 300)
+4. Half-precision (float16) where possible to reduce memory usage
+5. Smaller dataset subsets (100 training samples, 20 validation samples, 10 test samples)
+6. Special compatibility settings for Phi models running on CPU
+
+For best results, we recommend:
+- Using a machine with at least 16GB of RAM
+- Being patient! LLM training on CPU is much slower than on GPU
+- If you still encounter memory issues, try reducing the max_train_samples parameter even further in the config file
+
+### Known Issues and Workarounds
+
+Some large language models like Phi-3.5 have caching mechanisms that are optimized for GPU usage and may encounter issues when running on CPU. Our CPU configuration includes several workarounds:
+
+1. Disabling KV caching for model generation
+2. Using torch.float16 data type to reduce memory usage
+3. Disabling flash attention which isn't needed on CPU
+4. Using standard AdamW optimizer instead of 8-bit optimizers that require GPU
+
+These changes allow the model to run on CPU with less memory and avoid compatibility issues, although at the cost of some performance.
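For orientation, the settings the README describes roughly translate into a run configuration like the sketch below. This is only an illustrative guess: the key layout and the step/parameter names are assumptions, and the repository's actual phi3.5_finetune_cpu.yaml may differ.

```yaml
# Illustrative CPU-only run configuration (assumed keys and values, not the actual file)
parameters:
  base_model_id: microsoft/Phi-3.5-mini-instruct
  system_prompt: "..."        # the project's system prompt goes here
  use_fast: true
  load_in_8bit: false         # 8-bit/4-bit quantization requires a GPU
  load_in_4bit: false
  cpu_only: true              # skips the quantization checks and disables bf16 in the pipeline
  max_train_samples: 100      # small subsets keep CPU runs manageable
  max_val_samples: 20
  max_test_samples: 10
steps:
  finetune:
    parameters:
      max_steps: 50                    # instead of 300 on GPU
      per_device_train_batch_size: 1   # assumed value ("smaller batch sizes")
      gradient_accumulation_steps: 8   # assumed value; compensates for the tiny batch
```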

gamesense/pipelines/train.py

Lines changed: 35 additions & 8 deletions
@@ -33,6 +33,10 @@ def llm_peft_full_finetune(
     use_fast: bool = True,
     load_in_8bit: bool = False,
     load_in_4bit: bool = False,
+    cpu_only: bool = False,
+    max_train_samples: int = None,
+    max_val_samples: int = None,
+    max_test_samples: int = None,
 ):
     """Pipeline for finetuning an LLM with peft.
 
@@ -42,20 +46,39 @@ def llm_peft_full_finetune(
     - finetune: finetune the model
     - evaluate_model: evaluate the base and finetuned model
     - promote: promote the model to the target stage, if evaluation was successful
+
+    Args:
+        system_prompt: The system prompt to use.
+        base_model_id: The base model id to use.
+        use_fast: Whether to use the fast tokenizer.
+        load_in_8bit: Whether to load in 8-bit precision (requires GPU).
+        load_in_4bit: Whether to load in 4-bit precision (requires GPU).
+        cpu_only: Whether to force using CPU only and disable quantization.
+        max_train_samples: Maximum number of training samples to use (for CPU or testing).
+        max_val_samples: Maximum number of validation samples to use (for CPU or testing).
+        max_test_samples: Maximum number of test samples to use (for CPU or testing).
     """
-    if not load_in_8bit and not load_in_4bit:
-        raise ValueError(
-            "At least one of `load_in_8bit` and `load_in_4bit` must be True."
-        )
-    if load_in_4bit and load_in_8bit:
-        raise ValueError(
-            "Only one of `load_in_8bit` and `load_in_4bit` can be True."
-        )
+    if not cpu_only:
+        if not load_in_8bit and not load_in_4bit:
+            raise ValueError(
+                "At least one of `load_in_8bit` and `load_in_4bit` must be True when not in CPU-only mode."
+            )
+        if load_in_4bit and load_in_8bit:
+            raise ValueError(
+                "Only one of `load_in_8bit` and `load_in_4bit` can be True."
+            )
+
+    if cpu_only:
+        load_in_8bit = False
+        load_in_4bit = False
 
     datasets_dir = prepare_data(
         base_model_id=base_model_id,
         system_prompt=system_prompt,
         use_fast=use_fast,
+        max_train_samples=max_train_samples,
+        max_val_samples=max_val_samples,
+        max_test_samples=max_test_samples,
     )
 
     evaluate_model(
@@ -66,6 +89,7 @@ def llm_peft_full_finetune(
         use_fast=use_fast,
         load_in_8bit=load_in_8bit,
         load_in_4bit=load_in_4bit,
+        cpu_only=cpu_only,
        id="evaluate_base",
     )
     log_metadata_from_step_artifact(
@@ -82,6 +106,8 @@ def llm_peft_full_finetune(
         load_in_8bit=load_in_8bit,
         load_in_4bit=load_in_4bit,
         use_accelerate=False,
+        cpu_only=cpu_only,
+        bf16=not cpu_only,
     )
 
     evaluate_model(
@@ -92,6 +118,7 @@ def llm_peft_full_finetune(
         use_fast=use_fast,
         load_in_8bit=load_in_8bit,
         load_in_4bit=load_in_4bit,
+        cpu_only=cpu_only,
        id="evaluate_finetuned",
     )
     log_metadata_from_step_artifact(
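The pipeline now forwards max_train_samples, max_val_samples and max_test_samples into prepare_data, whose implementation is not part of this excerpt. A minimal sketch of how such per-split capping is typically done with 🤗 Datasets follows; the helper name and the surrounding wiring are assumptions, not the project's actual code.

```python
from typing import Optional

from datasets import Dataset


def _maybe_subsample(split: Dataset, max_samples: Optional[int]) -> Dataset:
    """Return at most `max_samples` rows of a split; None keeps the full split."""
    if max_samples is None or max_samples >= len(split):
        return split
    # select() keeps the first N rows; shuffle beforehand if a random subset is wanted
    return split.select(range(max_samples))


# e.g. inside prepare_data, after splitting/tokenization:
# train_ds = _maybe_subsample(train_ds, max_train_samples)  # 100 in the CPU config
# val_ds   = _maybe_subsample(val_ds, max_val_samples)      # 20
# test_ds  = _maybe_subsample(test_ds, max_test_samples)    # 10
```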

gamesense/pipelines/train_accelerated.py

Lines changed: 16 additions & 0 deletions
@@ -34,6 +34,9 @@ def llm_peft_full_finetune(
     use_fast: bool = True,
     load_in_8bit: bool = False,
     load_in_4bit: bool = False,
+    max_train_samples: int = None,
+    max_val_samples: int = None,
+    max_test_samples: int = None,
 ):
     """Pipeline for finetuning an LLM with peft.
 
@@ -43,6 +46,16 @@ def llm_peft_full_finetune(
     - finetune: finetune the model
     - evaluate_model: evaluate the base and finetuned model
     - promote: promote the model to the target stage, if evaluation was successful
+
+    Args:
+        system_prompt: The system prompt to use.
+        base_model_id: The base model id to use.
+        use_fast: Whether to use the fast tokenizer.
+        load_in_8bit: Whether to load in 8-bit precision (requires GPU).
+        load_in_4bit: Whether to load in 4-bit precision (requires GPU).
+        max_train_samples: Maximum number of training samples to use (for CPU or testing).
+        max_val_samples: Maximum number of validation samples to use (for CPU or testing).
+        max_test_samples: Maximum number of test samples to use (for CPU or testing).
     """
     if not load_in_8bit and not load_in_4bit:
         raise ValueError(
@@ -57,6 +70,9 @@ def llm_peft_full_finetune(
         base_model_id=base_model_id,
         system_prompt=system_prompt,
         use_fast=use_fast,
+        max_train_samples=max_train_samples,
+        max_val_samples=max_val_samples,
+        max_test_samples=max_test_samples,
     )
 
     evaluate_model(

gamesense/run.py

Lines changed: 13 additions & 1 deletion
@@ -76,7 +76,19 @@ def main(
     if not config:
         raise RuntimeError("Config file is required to run a pipeline.")
 
-    pipeline_args["config_path"] = os.path.join(config_folder, config)
+    config_path = os.path.join(config_folder, config)
+    pipeline_args["config_path"] = config_path
+
+    # Display a message if using CPU configuration
+    if "cpu" in config:
+        print("\n" + "="*80)
+        print("RUNNING IN CPU-ONLY MODE")
+        print("This will use a CPU-optimized configuration with:")
+        print("- Smaller batch sizes")
+        print("- Fewer training steps")
+        print("- Disabled GPU-specific features (quantization, bf16, etc)")
+        print("Note: Training will be much slower but should require less memory")
+        print("="*80 + "\n")
 
     if accelerate:
         from pipelines.train_accelerated import llm_peft_full_finetune

gamesense/steps/evaluate_model.py

Lines changed: 112 additions & 7 deletions
@@ -45,6 +45,7 @@ def evaluate_model(
     use_fast: bool = True,
     load_in_4bit: bool = False,
     load_in_8bit: bool = False,
+    cpu_only: bool = False,
 ) -> None:
     """Evaluate the model with ROUGE metrics.
 
@@ -57,7 +58,13 @@
         use_fast: Whether to use the fast tokenizer.
         load_in_4bit: Whether to load the model in 4bit mode.
         load_in_8bit: Whether to load the model in 8bit mode.
+        cpu_only: Whether to force using CPU only and disable quantization.
     """
+    # Force disable GPU optimizations if in CPU-only mode
+    if cpu_only:
+        load_in_4bit = False
+        load_in_8bit = False
+
     cleanup_gpu_memory(force=True)
 
     # authenticate with Hugging Face for gated repos
@@ -79,7 +86,14 @@
         use_fast=use_fast,
     )
     test_dataset = load_from_disk(str((datasets_dir / "test_raw").absolute()))
-    test_dataset = test_dataset[:50]
+
+    # Reduce dataset size for CPU evaluation to make it more manageable
+    if cpu_only:
+        logger.info("CPU-only mode: Using a smaller test dataset subset")
+        test_dataset = test_dataset[:10]  # Use only 10 samples for CPU
+    else:
+        test_dataset = test_dataset[:50]  # Use 50 samples for GPU
+
     ground_truths = test_dataset["meaning_representation"]
     tokenized_train_dataset = tokenize_for_eval(
         test_dataset, tokenizer, system_prompt
@@ -92,23 +106,114 @@
             is_training=False,
             load_in_4bit=load_in_4bit,
             load_in_8bit=load_in_8bit,
+            cpu_only=cpu_only,
         )
     else:
         logger.info("Generating using finetuned model...")
         model = load_pretrained_model(
             ft_model_dir,
             load_in_4bit=load_in_4bit,
             load_in_8bit=load_in_8bit,
+            cpu_only=cpu_only,
         )
 
     model.eval()
+
+    # Adjust generation parameters for CPU
+    max_new_tokens = 30 if cpu_only else 100
+
+    # Preemptively disable use_cache for Phi models on CPU to avoid 'get_max_length' error
+    is_phi_model = "phi" in base_model_id.lower()
+    use_cache = not (is_phi_model and cpu_only)
+
+    if not use_cache:
+        logger.info("Preemptively disabling KV cache for Phi model on CPU")
+        if hasattr(model.config, "use_cache"):
+            model.config.use_cache = False
+
     with torch.no_grad():
-        predictions = model.generate(
-            input_ids=tokenized_train_dataset["input_ids"],
-            attention_mask=tokenized_train_dataset["attention_mask"],
-            max_new_tokens=100,
-            pad_token_id=2,
-        )
+        try:
+            # Move inputs to the same device as the model
+            device = next(model.parameters()).device
+            input_ids = tokenized_train_dataset["input_ids"].to(device)
+            attention_mask = tokenized_train_dataset["attention_mask"].to(device)
+
+            # Generate with appropriate parameters
+            logger.info(f"Generating with use_cache={use_cache}")
+            predictions = model.generate(
+                input_ids=input_ids,
+                attention_mask=attention_mask,
+                max_new_tokens=max_new_tokens,
+                pad_token_id=2,
+                use_cache=use_cache,  # Use the preemptively determined setting
+                do_sample=False  # Use greedy decoding for more stable results on CPU
+            )
+        except (AttributeError, RuntimeError) as e:
+            logger.warning(f"Initial generation attempt failed with error: {str(e)}")
+
+            # First fallback: try with more safety settings
+            if "get_max_length" in str(e) or "DynamicCache" in str(e) or cpu_only:
+                logger.warning("Using fallback generation strategy with minimal parameters")
+                try:
+                    # Force model to CPU if needed
+                    if not str(next(model.parameters()).device) == "cpu":
+                        logger.info("Moving model to CPU for generation")
+                        model = model.to("cpu")
+
+                    # Move inputs to CPU
+                    input_ids = tokenized_train_dataset["input_ids"].to("cpu")
+                    attention_mask = tokenized_train_dataset["attention_mask"].to("cpu")
+
+                    predictions = model.generate(
+                        input_ids=input_ids,
+                        attention_mask=attention_mask,
+                        max_new_tokens=20,  # Even smaller for safety
+                        pad_token_id=2,
+                        use_cache=False,  # Disable KV caching completely
+                        do_sample=False,  # Use greedy decoding
+                        num_beams=1  # Simple beam search
+                    )
+                except (RuntimeError, Exception) as e2:
+                    logger.warning(f"Second generation attempt failed with error: {str(e2)}")
+
+                    # Final fallback: process one sample at a time
+                    logger.warning("Final fallback: processing one sample at a time")
+
+                    # Process one sample at a time
+                    all_predictions = []
+                    batch_size = tokenized_train_dataset["input_ids"].shape[0]
+
+                    for i in range(batch_size):
+                        try:
+                            # Process one sample at a time
+                            single_input = tokenized_train_dataset["input_ids"][i:i+1].to("cpu")
+                            single_attention = tokenized_train_dataset["attention_mask"][i:i+1].to("cpu")
+
+                            single_pred = model.generate(
+                                input_ids=single_input,
+                                attention_mask=single_attention,
+                                max_new_tokens=20,  # Even further reduced for safety
+                                num_beams=1,
+                                do_sample=False,
+                                use_cache=False,
+                                pad_token_id=2,
+                            )
+                            all_predictions.append(single_pred)
+                        except Exception as sample_error:
+                            logger.error(f"Failed to generate for sample {i}: {str(sample_error)}")
+                            # Create an empty prediction as placeholder
+                            all_predictions.append(tokenized_train_dataset["input_ids"][i:i+1])
+
+                    # Combine the individual predictions
+                    if all_predictions:
+                        predictions = torch.cat(all_predictions, dim=0)
+                    else:
+                        # If all samples failed, return original inputs
+                        logger.error("All samples failed in generation. Using inputs as fallback.")
+                        predictions = tokenized_train_dataset["input_ids"]
+            else:
+                # Re-raise if not a cache-related issue
+                raise e
     predictions = tokenizer.batch_decode(
         predictions[:, tokenized_train_dataset["input_ids"].shape[1] :],
         skip_special_tokens=True,
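Both branches above pass the new cpu_only flag into load_base_model / load_pretrained_model, whose bodies live in other files of this commit. As a rough sketch of the CPU-safe loading the README's workarounds describe (fp16 weights, no bitsandbytes quantization, eager attention instead of flash attention, KV cache off), one could write something like the following; the helper name and defaults are assumptions, not the repository's actual code.

```python
import torch
from transformers import AutoModelForCausalLM


def load_model_for_cpu(model_id_or_path: str):
    """Hypothetical CPU-only loader mirroring the workarounds listed in the README."""
    model = AutoModelForCausalLM.from_pretrained(
        model_id_or_path,
        torch_dtype=torch.float16,    # roughly halves memory vs. float32
        attn_implementation="eager",  # flash-attention kernels are GPU-only
    )
    if hasattr(model.config, "use_cache"):
        model.config.use_cache = False  # sidestep the Phi-3.5 KV-cache issue on CPU
    model.eval()
    return model
```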
