@@ -69,17 +69,24 @@ def load_prompt_config(prompt_path):
def load_hf_dataset(config):
    """Load a HuggingFace dataset based on configuration.

    Args:
        config: dict with required key 'dataset_name' and optional keys
            'dataset_config' (a subset/config name passed as the second
            positional argument to ``load_dataset``) and 'split'
            (defaults to 'test').

    Returns:
        The loaded dataset. If the requested split is unavailable, falls
        back to the 'train' split.
    """
    dataset_name = config['dataset_name']
    dataset_config = config.get('dataset_config', None)
    split = config.get('split', 'test')

    print(f"Loading dataset: {dataset_name}")

    # Build the positional args once instead of duplicating the
    # load_dataset call in four branches: (path,) or (path, config_name).
    load_args = (dataset_name, dataset_config) if dataset_config else (dataset_name,)

    try:
        # Try to load the specified split
        dataset = load_dataset(*load_args, split=split)
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # still propagate. Fallback to train split if test is not available.
        print(f"Split '{split}' not found, falling back to 'train'")
        dataset = load_dataset(*load_args, split='train')

    print(f"Dataset loaded with {len(dataset)} examples")
    return dataset
@@ -89,8 +96,10 @@ def evaluate_prompt(prompt, dataset, config, num_samples):
8996 input_field = config ['input_field' ]
9097 target_field = config ['target_field' ]
9198
92- # Check if this is emotion classification (0-5) or sentiment (0-1)
93- is_emotion = 'emotion' in config .get ('dataset_name' , '' ).lower ()
99+ # Check dataset type
100+ dataset_name = config .get ('dataset_name' , '' ).lower ()
101+ is_emotion = 'emotion' in dataset_name
102+ is_gsm8k = 'gsm8k' in dataset_name
94103
95104 # Sample from dataset
96105 samples = dataset .select (range (min (num_samples , len (dataset ))))
@@ -110,11 +119,14 @@ def evaluate_prompt(prompt, dataset, config, num_samples):
110119 # Call the LLM with retry logic
111120 for attempt in range (MAX_RETRIES ):
112121 try :
122+ # Adjust max_tokens based on task
123+ max_tokens = 500 if is_gsm8k else 20
124+
113125 response = test_model .chat .completions .create (
114126 model = TASK_MODEL_NAME ,
115127 messages = messages ,
116- temperature = 0.1 , # Low temperature for consistent classification
117- max_tokens = 20 # Allow slightly more tokens for emotion labels
128+ temperature = 0.1 , # Low temperature for consistent results
129+ max_tokens = max_tokens
118130 )
119131 break
120132 except Exception as e :
@@ -150,7 +162,41 @@ def evaluate_prompt(prompt, dataset, config, num_samples):
150162
151163 # Extract prediction from output
152164 try :
153- if is_emotion :
165+ if is_gsm8k :
166+ # For GSM8K, extract the numeric answer after ####
167+ # First, extract the expected answer from the ground truth
168+ expected_answer = expected .split ('####' )[- 1 ].strip ()
169+ try :
170+ expected_number = float (expected_answer .replace (',' , '' ))
171+ except :
172+ print (f"Warning: Could not parse expected answer: { expected_answer } " )
173+ total += 1
174+ continue
175+
176+ # Extract prediction from model output
177+ prediction = None
178+ if '####' in output_text :
179+ predicted_answer = output_text .split ('####' )[- 1 ].strip ()
180+ # Extract just the number, removing any extra text like $ signs
181+ import re
182+ numbers = re .findall (r'-?\$?[\d,]+\.?\d*' , predicted_answer )
183+ if numbers :
184+ try :
185+ # Remove $ and , from the number
186+ number_str = numbers [0 ].replace ('$' , '' ).replace (',' , '' )
187+ prediction = float (number_str )
188+ except :
189+ pass
190+
191+ # If we found a prediction, check if it matches
192+ if prediction is not None :
193+ # Check if answers match (with small tolerance for floats)
194+ if abs (prediction - expected_number ) < 0.001 :
195+ correct += 1
196+
197+ total += 1
198+
199+ elif is_emotion :
154200 # For emotion classification (0-5)
155201 numbers = re .findall (r'\b[0-5]\b' , output_text )
156202 if numbers :
0 commit comments