
Commit d274f6c

make the project run on CPU
1 parent 27469df commit d274f6c

9 files changed (+416 / -49 lines)

gamesense/README.md

Lines changed: 41 additions & 0 deletions
@@ -206,3 +206,44 @@ For custom data sources, you'll need to prepare the splits in a Hugging Face dat
 ## 📚 Documentation
 
 For learning more about how to use ZenML to build your own MLOps pipelines, refer to our comprehensive [ZenML documentation](https://docs.zenml.io/).
+
+## Running on CPU-only Environment
+
+If you don't have access to a GPU, you can still run this project with the CPU-only configuration. We've made several optimizations to make this project work on CPU, including:
+
+- Smaller batch sizes for reduced memory footprint
+- Fewer training steps
+- Disabled GPU-specific features (quantization, bf16, etc.)
+- Using smaller test datasets for evaluation
+- Special handling for Phi-3.5 model caching issues on CPU
+
+To run the project on CPU:
+
+```bash
+python run.py --config phi3.5_finetune_cpu.yaml
+```
+
+Note that training on CPU will be significantly slower than training on a GPU. The CPU configuration uses:
+
+1. A smaller model (Phi-3.5-mini-instruct) which is more CPU-friendly
+2. Reduced batch size and increased gradient accumulation steps
+3. Fewer total training steps (50 instead of 300)
+4. Half-precision (float16) where possible to reduce memory usage
+5. Smaller dataset subsets (100 training samples, 20 validation samples, 10 test samples)
+6. Special compatibility settings for Phi models running on CPU
+
+For best results, we recommend:
+- Using a machine with at least 16GB of RAM
+- Being patient! LLM training on CPU is much slower than on GPU
+- If you still encounter memory issues, try reducing the max_train_samples parameter even further in the config file
+
+### Known Issues and Workarounds
+
+Some large language models like Phi-3.5 have caching mechanisms that are optimized for GPU usage and may encounter issues when running on CPU. Our CPU configuration includes several workarounds:
+
+1. Disabling KV caching for model generation
+2. Using torch.float16 data type to reduce memory usage
+3. Disabling flash attention which isn't needed on CPU
+4. Using standard AdamW optimizer instead of 8-bit optimizers that require GPU
+
+These changes allow the model to run on CPU with less memory and avoid compatibility issues, although at the cost of some performance.
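For orientation, the settings the README describes roughly translate into a run configuration like the sketch below. This is only an illustrative guess: the key layout and the step/parameter names are assumptions, and the repository's actual phi3.5_finetune_cpu.yaml may differ.

```yaml
# Illustrative CPU-only run configuration (assumed keys and values, not the actual file)
parameters:
  base_model_id: microsoft/Phi-3.5-mini-instruct
  system_prompt: "..."        # the project's system prompt goes here
  use_fast: true
  load_in_8bit: false         # 8-bit/4-bit quantization requires a GPU
  load_in_4bit: false
  cpu_only: true              # skips the quantization checks and disables bf16 in the pipeline
  max_train_samples: 100      # small subsets keep CPU runs manageable
  max_val_samples: 20
  max_test_samples: 10
steps:
  finetune:
    parameters:
      max_steps: 50                    # instead of 300 on GPU
      per_device_train_batch_size: 1   # assumed value ("smaller batch sizes")
      gradient_accumulation_steps: 8   # assumed value; compensates for the tiny batch
```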

gamesense/pipelines/train.py

Lines changed: 35 additions & 8 deletions
@@ -33,6 +33,10 @@ def llm_peft_full_finetune(
     use_fast: bool = True,
     load_in_8bit: bool = False,
     load_in_4bit: bool = False,
+    cpu_only: bool = False,
+    max_train_samples: int = None,
+    max_val_samples: int = None,
+    max_test_samples: int = None,
 ):
     """Pipeline for finetuning an LLM with peft.
 
@@ -42,20 +46,39 @@ def llm_peft_full_finetune(
     - finetune: finetune the model
     - evaluate_model: evaluate the base and finetuned model
     - promote: promote the model to the target stage, if evaluation was successful
+
+    Args:
+        system_prompt: The system prompt to use.
+        base_model_id: The base model id to use.
+        use_fast: Whether to use the fast tokenizer.
+        load_in_8bit: Whether to load in 8-bit precision (requires GPU).
+        load_in_4bit: Whether to load in 4-bit precision (requires GPU).
+        cpu_only: Whether to force using CPU only and disable quantization.
+        max_train_samples: Maximum number of training samples to use (for CPU or testing).
+        max_val_samples: Maximum number of validation samples to use (for CPU or testing).
+        max_test_samples: Maximum number of test samples to use (for CPU or testing).
     """
-    if not load_in_8bit and not load_in_4bit:
-        raise ValueError(
-            "At least one of `load_in_8bit` and `load_in_4bit` must be True."
-        )
-    if load_in_4bit and load_in_8bit:
-        raise ValueError(
-            "Only one of `load_in_8bit` and `load_in_4bit` can be True."
-        )
+    if not cpu_only:
+        if not load_in_8bit and not load_in_4bit:
+            raise ValueError(
+                "At least one of `load_in_8bit` and `load_in_4bit` must be True when not in CPU-only mode."
+            )
+        if load_in_4bit and load_in_8bit:
+            raise ValueError(
+                "Only one of `load_in_8bit` and `load_in_4bit` can be True."
+            )
+
+    if cpu_only:
+        load_in_8bit = False
+        load_in_4bit = False
 
     datasets_dir = prepare_data(
         base_model_id=base_model_id,
         system_prompt=system_prompt,
         use_fast=use_fast,
+        max_train_samples=max_train_samples,
+        max_val_samples=max_val_samples,
+        max_test_samples=max_test_samples,
     )
 
     evaluate_model(
@@ -66,6 +89,7 @@ def llm_peft_full_finetune(
         use_fast=use_fast,
         load_in_8bit=load_in_8bit,
         load_in_4bit=load_in_4bit,
+        cpu_only=cpu_only,
        id="evaluate_base",
     )
     log_metadata_from_step_artifact(
@@ -82,6 +106,8 @@ def llm_peft_full_finetune(
         load_in_8bit=load_in_8bit,
         load_in_4bit=load_in_4bit,
         use_accelerate=False,
+        cpu_only=cpu_only,
+        bf16=not cpu_only,
     )
 
     evaluate_model(
@@ -92,6 +118,7 @@ def llm_peft_full_finetune(
         use_fast=use_fast,
         load_in_8bit=load_in_8bit,
         load_in_4bit=load_in_4bit,
+        cpu_only=cpu_only,
        id="evaluate_finetuned",
     )
     log_metadata_from_step_artifact(
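The pipeline now forwards max_train_samples, max_val_samples and max_test_samples into prepare_data, whose implementation is not part of this excerpt. A minimal sketch of how such per-split capping is typically done with 🤗 Datasets follows; the helper name and the surrounding wiring are assumptions, not the project's actual code.

```python
from typing import Optional

from datasets import Dataset


def _maybe_subsample(split: Dataset, max_samples: Optional[int]) -> Dataset:
    """Return at most `max_samples` rows of a split; None keeps the full split."""
    if max_samples is None or max_samples >= len(split):
        return split
    # select() keeps the first N rows; shuffle beforehand if a random subset is wanted
    return split.select(range(max_samples))


# e.g. inside prepare_data, after splitting/tokenization:
# train_ds = _maybe_subsample(train_ds, max_train_samples)  # 100 in the CPU config
# val_ds   = _maybe_subsample(val_ds, max_val_samples)      # 20
# test_ds  = _maybe_subsample(test_ds, max_test_samples)    # 10
```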

gamesense/pipelines/train_accelerated.py

Lines changed: 16 additions & 0 deletions
@@ -34,6 +34,9 @@ def llm_peft_full_finetune(
     use_fast: bool = True,
     load_in_8bit: bool = False,
     load_in_4bit: bool = False,
+    max_train_samples: int = None,
+    max_val_samples: int = None,
+    max_test_samples: int = None,
 ):
     """Pipeline for finetuning an LLM with peft.
 
@@ -43,6 +46,16 @@ def llm_peft_full_finetune(
     - finetune: finetune the model
     - evaluate_model: evaluate the base and finetuned model
     - promote: promote the model to the target stage, if evaluation was successful
+
+    Args:
+        system_prompt: The system prompt to use.
+        base_model_id: The base model id to use.
+        use_fast: Whether to use the fast tokenizer.
+        load_in_8bit: Whether to load in 8-bit precision (requires GPU).
+        load_in_4bit: Whether to load in 4-bit precision (requires GPU).
+        max_train_samples: Maximum number of training samples to use (for CPU or testing).
+        max_val_samples: Maximum number of validation samples to use (for CPU or testing).
+        max_test_samples: Maximum number of test samples to use (for CPU or testing).
     """
     if not load_in_8bit and not load_in_4bit:
         raise ValueError(
@@ -57,6 +70,9 @@ def llm_peft_full_finetune(
         base_model_id=base_model_id,
         system_prompt=system_prompt,
         use_fast=use_fast,
+        max_train_samples=max_train_samples,
+        max_val_samples=max_val_samples,
+        max_test_samples=max_test_samples,
     )
 
     evaluate_model(

gamesense/run.py

Lines changed: 13 additions & 1 deletion
@@ -76,7 +76,19 @@ def main(
     if not config:
         raise RuntimeError("Config file is required to run a pipeline.")
 
-    pipeline_args["config_path"] = os.path.join(config_folder, config)
+    config_path = os.path.join(config_folder, config)
+    pipeline_args["config_path"] = config_path
+
+    # Display a message if using CPU configuration
+    if "cpu" in config:
+        print("\n" + "="*80)
+        print("RUNNING IN CPU-ONLY MODE")
+        print("This will use a CPU-optimized configuration with:")
+        print("- Smaller batch sizes")
+        print("- Fewer training steps")
+        print("- Disabled GPU-specific features (quantization, bf16, etc)")
+        print("Note: Training will be much slower but should require less memory")
+        print("="*80 + "\n")
 
     if accelerate:
         from pipelines.train_accelerated import llm_peft_full_finetune

gamesense/steps/evaluate_model.py

Lines changed: 112 additions & 7 deletions
@@ -45,6 +45,7 @@ def evaluate_model(
     use_fast: bool = True,
     load_in_4bit: bool = False,
     load_in_8bit: bool = False,
+    cpu_only: bool = False,
 ) -> None:
     """Evaluate the model with ROUGE metrics.
 
@@ -57,7 +58,13 @@
         use_fast: Whether to use the fast tokenizer.
         load_in_4bit: Whether to load the model in 4bit mode.
         load_in_8bit: Whether to load the model in 8bit mode.
+        cpu_only: Whether to force using CPU only and disable quantization.
     """
+    # Force disable GPU optimizations if in CPU-only mode
+    if cpu_only:
+        load_in_4bit = False
+        load_in_8bit = False
+
     cleanup_gpu_memory(force=True)
 
     # authenticate with Hugging Face for gated repos
@@ -79,7 +86,14 @@
         use_fast=use_fast,
     )
     test_dataset = load_from_disk(str((datasets_dir / "test_raw").absolute()))
-    test_dataset = test_dataset[:50]
+
+    # Reduce dataset size for CPU evaluation to make it more manageable
+    if cpu_only:
+        logger.info("CPU-only mode: Using a smaller test dataset subset")
+        test_dataset = test_dataset[:10]  # Use only 10 samples for CPU
+    else:
+        test_dataset = test_dataset[:50]  # Use 50 samples for GPU
+
     ground_truths = test_dataset["meaning_representation"]
     tokenized_train_dataset = tokenize_for_eval(
         test_dataset, tokenizer, system_prompt
@@ -92,23 +106,114 @@
             is_training=False,
             load_in_4bit=load_in_4bit,
             load_in_8bit=load_in_8bit,
+            cpu_only=cpu_only,
         )
     else:
         logger.info("Generating using finetuned model...")
         model = load_pretrained_model(
             ft_model_dir,
             load_in_4bit=load_in_4bit,
             load_in_8bit=load_in_8bit,
+            cpu_only=cpu_only,
         )
 
     model.eval()
+
+    # Adjust generation parameters for CPU
+    max_new_tokens = 30 if cpu_only else 100
+
+    # Preemptively disable use_cache for Phi models on CPU to avoid 'get_max_length' error
+    is_phi_model = "phi" in base_model_id.lower()
+    use_cache = not (is_phi_model and cpu_only)
+
+    if not use_cache:
+        logger.info("Preemptively disabling KV cache for Phi model on CPU")
+        if hasattr(model.config, "use_cache"):
+            model.config.use_cache = False
+
     with torch.no_grad():
-        predictions = model.generate(
-            input_ids=tokenized_train_dataset["input_ids"],
-            attention_mask=tokenized_train_dataset["attention_mask"],
-            max_new_tokens=100,
-            pad_token_id=2,
-        )
+        try:
+            # Move inputs to the same device as the model
+            device = next(model.parameters()).device
+            input_ids = tokenized_train_dataset["input_ids"].to(device)
+            attention_mask = tokenized_train_dataset["attention_mask"].to(device)
+
+            # Generate with appropriate parameters
+            logger.info(f"Generating with use_cache={use_cache}")
+            predictions = model.generate(
+                input_ids=input_ids,
+                attention_mask=attention_mask,
+                max_new_tokens=max_new_tokens,
+                pad_token_id=2,
+                use_cache=use_cache,  # Use the preemptively determined setting
+                do_sample=False  # Use greedy decoding for more stable results on CPU
+            )
+        except (AttributeError, RuntimeError) as e:
+            logger.warning(f"Initial generation attempt failed with error: {str(e)}")
+
+            # First fallback: try with more safety settings
+            if "get_max_length" in str(e) or "DynamicCache" in str(e) or cpu_only:
+                logger.warning("Using fallback generation strategy with minimal parameters")
+                try:
+                    # Force model to CPU if needed
+                    if not str(next(model.parameters()).device) == "cpu":
+                        logger.info("Moving model to CPU for generation")
+                        model = model.to("cpu")
+
+                    # Move inputs to CPU
+                    input_ids = tokenized_train_dataset["input_ids"].to("cpu")
+                    attention_mask = tokenized_train_dataset["attention_mask"].to("cpu")
+
+                    predictions = model.generate(
+                        input_ids=input_ids,
+                        attention_mask=attention_mask,
+                        max_new_tokens=20,  # Even smaller for safety
+                        pad_token_id=2,
+                        use_cache=False,  # Disable KV caching completely
+                        do_sample=False,  # Use greedy decoding
+                        num_beams=1  # Simple beam search
+                    )
+                except (RuntimeError, Exception) as e2:
+                    logger.warning(f"Second generation attempt failed with error: {str(e2)}")
+
+                    # Final fallback: process one sample at a time
+                    logger.warning("Final fallback: processing one sample at a time")
+
+                    # Process one sample at a time
+                    all_predictions = []
+                    batch_size = tokenized_train_dataset["input_ids"].shape[0]
+
+                    for i in range(batch_size):
+                        try:
+                            # Process one sample at a time
+                            single_input = tokenized_train_dataset["input_ids"][i:i+1].to("cpu")
+                            single_attention = tokenized_train_dataset["attention_mask"][i:i+1].to("cpu")
+
+                            single_pred = model.generate(
+                                input_ids=single_input,
+                                attention_mask=single_attention,
+                                max_new_tokens=20,  # Even further reduced for safety
+                                num_beams=1,
+                                do_sample=False,
+                                use_cache=False,
+                                pad_token_id=2,
+                            )
+                            all_predictions.append(single_pred)
+                        except Exception as sample_error:
+                            logger.error(f"Failed to generate for sample {i}: {str(sample_error)}")
+                            # Create an empty prediction as placeholder
+                            all_predictions.append(tokenized_train_dataset["input_ids"][i:i+1])
+
+                    # Combine the individual predictions
+                    if all_predictions:
+                        predictions = torch.cat(all_predictions, dim=0)
+                    else:
+                        # If all samples failed, return original inputs
+                        logger.error("All samples failed in generation. Using inputs as fallback.")
+                        predictions = tokenized_train_dataset["input_ids"]
+            else:
+                # Re-raise if not a cache-related issue
+                raise e
     predictions = tokenizer.batch_decode(
         predictions[:, tokenized_train_dataset["input_ids"].shape[1] :],
         skip_special_tokens=True,
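Both branches above pass the new cpu_only flag into load_base_model / load_pretrained_model, whose bodies live in other files of this commit. As a rough sketch of the CPU-safe loading the README's workarounds describe (fp16 weights, no bitsandbytes quantization, eager attention instead of flash attention, KV cache off), one could write something like the following; the helper name and defaults are assumptions, not the repository's actual code.

```python
import torch
from transformers import AutoModelForCausalLM


def load_model_for_cpu(model_id_or_path: str):
    """Hypothetical CPU-only loader mirroring the workarounds listed in the README."""
    model = AutoModelForCausalLM.from_pretrained(
        model_id_or_path,
        torch_dtype=torch.float16,    # roughly halves memory vs. float32
        attn_implementation="eager",  # flash-attention kernels are GPU-only
    )
    if hasattr(model.config, "use_cache"):
        model.config.use_cache = False  # sidestep the Phi-3.5 KV-cache issue on CPU
    model.eval()
    return model
```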
