Commit 102d419

fix
1 parent 490ced2 commit 102d419

5 files changed: +152 additions, -58 deletions

README.md
Lines changed: 23 additions & 0 deletions

@@ -377,6 +377,29 @@ database:
     correctness: 15 # 15 bins for correctness (from YOUR evaluator)
 ```

+**CRITICAL: Return Raw Values, Not Bin Indices**: For custom feature dimensions, your evaluator must return **raw continuous values**, not pre-computed bin indices. OpenEvolve handles all scaling and binning internally.
+
+```python
+# ✅ CORRECT: Return raw values
+return {
+    "combined_score": 0.85,
+    "prompt_length": 1247,   # Actual character count
+    "execution_time": 0.234  # Raw time in seconds
+}
+
+# ❌ WRONG: Don't return bin indices
+return {
+    "combined_score": 0.85,
+    "prompt_length": 7,   # Pre-computed bin index
+    "execution_time": 3   # Pre-computed bin index
+}
+```
+
+OpenEvolve automatically handles:
+- Min-max scaling to [0,1] range
+- Binning into the specified number of bins
+- Adaptive scaling as the value range expands during evolution
+
 **Important**: OpenEvolve will raise an error if a specified feature is not found in the evaluator's metrics. This ensures your configuration is correct. The error message will show available metrics to help you fix the configuration.

 See the [Configuration Guide](configs/default_config.yaml) for a full list of options.
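To make the "automatically handles" steps above concrete, here is a minimal sketch of min-max scaling followed by binning. The `value_to_bin` helper and the example numbers are illustrative assumptions, not OpenEvolve's actual API or internal code.

```python
# Illustrative sketch only: how min-max scaling plus binning can turn a raw
# metric value into a MAP-Elites bin index. Not OpenEvolve's internal code.

def value_to_bin(raw_value: float, observed_min: float, observed_max: float, num_bins: int) -> int:
    """Scale raw_value to [0, 1] against the observed range, then pick a bin."""
    if observed_max <= observed_min:
        return 0  # Degenerate range: everything lands in the first bin
    scaled = (raw_value - observed_min) / (observed_max - observed_min)
    scaled = max(0.0, min(1.0, scaled))  # Clamp in case the range has since expanded
    return min(num_bins - 1, int(scaled * num_bins))

# A raw prompt length of 1247 characters, with observed lengths of 80-2600 so far,
# falls into bin 4 of a 10-bin dimension.
print(value_to_bin(1247, observed_min=80, observed_max=2600, num_bins=10))  # -> 4
```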

examples/README.md
Lines changed: 50 additions & 0 deletions

@@ -133,6 +133,56 @@ log_level: "INFO"
 ❌ **Wrong:** Multiple EVOLVE-BLOCK sections
 ✅ **Correct:** Exactly one EVOLVE-BLOCK section

+## MAP-Elites Feature Dimensions Best Practices
+
+When using custom feature dimensions, your evaluator must return **raw continuous values**, not pre-computed bin indices:
+
+### ✅ Correct: Return Raw Values
+```python
+def evaluate(program_path: str) -> Dict:
+    # Calculate actual measurements
+    prompt_length = len(generated_prompt)  # Actual character count
+    execution_time = measure_runtime()     # Time in seconds
+    memory_usage = get_peak_memory()       # Bytes used
+
+    return {
+        "combined_score": accuracy_score,
+        "prompt_length": prompt_length,    # Raw count, not bin index
+        "execution_time": execution_time,  # Raw seconds, not bin index
+        "memory_usage": memory_usage       # Raw bytes, not bin index
+    }
+```
+
+### ❌ Wrong: Return Bin Indices
+```python
+def evaluate(program_path: str) -> Dict:
+    prompt_length = len(generated_prompt)
+
+    # DON'T DO THIS - pre-computing bins
+    if prompt_length < 100:
+        length_bin = 0
+    elif prompt_length < 500:
+        length_bin = 1
+    # ... more binning logic
+
+    return {
+        "combined_score": accuracy_score,
+        "prompt_length": length_bin,  # ❌ This is a bin index, not raw value
+    }
+```
+
+### Why This Matters
+- OpenEvolve uses min-max scaling internally
+- Bin indices get incorrectly scaled as if they were raw values
+- Grid positions become unstable as new programs change the min/max range
+- This violates MAP-Elites principles and leads to poor evolution
+
+### Examples of Good Feature Dimensions
+- **Counts**: Token count, line count, character count
+- **Performance**: Execution time, memory usage, throughput
+- **Quality**: Accuracy, precision, recall, F1 score
+- **Complexity**: Cyclomatic complexity, nesting depth, function count
+
 ## Running Your Example

 ```bash
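The "Why This Matters" points above can be demonstrated with a small sketch. The `rescale` helper is a stand-in assumption for the database's internal scaling and binning, not OpenEvolve code; it shows how pre-computed bin indices get re-scaled as if they were raw values, so a program's grid cell drifts whenever the observed min/max changes.

```python
# Stand-in for the database's min-max scaling + binning; illustrative only.
def rescale(values, num_bins=10):
    lo, hi = min(values), max(values)
    if hi == lo:
        return [0 for _ in values]
    return [min(num_bins - 1, int((v - lo) / (hi - lo) * num_bins)) for v in values]

# Evaluator returns pre-computed bin indices 0, 1, 7 (intending cells 0, 1, 7):
print(rescale([0, 1, 7]))     # [0, 1, 9] -- "bin 7" is re-binned into cell 9
# A new program widens the observed range, and the same program changes cell:
print(rescale([0, 1, 7, 9]))  # [0, 1, 7, 9] -- it now sits in cell 7 instead
```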

examples/llm_prompt_optimization/evaluator.py
Lines changed: 58 additions & 57 deletions

@@ -61,38 +61,28 @@

 def calculate_prompt_features(prompt):
     """
-    Calculate custom features for MAP-Elites binning
+    Calculate custom features for MAP-Elites
+
+    IMPORTANT: Returns raw continuous values, not bin indices.
+    The database handles all scaling and binning automatically.

     Returns:
-        tuple: (prompt_length, reasoning_strategy) - both in range 0-9
+        tuple: (prompt_length, reasoning_sophistication_score)
+        - prompt_length: Actual character count
+        - reasoning_sophistication_score: Continuous score 0.0-1.0
     """
-    # Feature 1: Prompt length bin (0-9)
-    length = len(prompt)
-    if length < 100:
-        prompt_length = 0  # Minimal
-    elif length < 200:
-        prompt_length = 1  # Very short
-    elif length < 400:
-        prompt_length = 2  # Short
-    elif length < 600:
-        prompt_length = 3  # Medium-short
-    elif length < 900:
-        prompt_length = 4  # Medium
-    elif length < 1200:
-        prompt_length = 5  # Medium-long
-    elif length < 1600:
-        prompt_length = 6  # Long
-    elif length < 2000:
-        prompt_length = 7  # Very long
-    elif length < 2500:
-        prompt_length = 8  # Extensive
-    else:
-        prompt_length = 9  # Very extensive
+    # Feature 1: Prompt length (raw character count)
+    prompt_length = len(prompt)

-    # Feature 2: Reasoning strategy (0-9)
+    # Feature 2: Reasoning sophistication score (continuous 0.0-1.0)
     prompt_lower = prompt.lower()
+    sophistication_score = 0.0
+
+    # Base scoring
+    if len(prompt) >= 100:
+        sophistication_score += 0.1  # Has substantial content

-    # Check for few-shot examples
+    # Check for few-shot examples (high sophistication)
     has_example = (
         "example" in prompt_lower
         or prompt.count("####") >= 4

@@ -107,33 +97,40 @@ def calculate_prompt_features(prompt):
         or bool(re.search(r"(first|then|next|finally)", prompt_lower))
     )

-    # Assign reasoning strategy bins
+    # Check for directive language
+    has_directive = "solve" in prompt_lower or "calculate" in prompt_lower
+
+    # Check for strict language
+    has_strict = "must" in prompt_lower or "exactly" in prompt_lower
+
+    # Calculate sophistication score
     if has_example:
-        # Few-shot examples (bins 7-9)
+        sophistication_score += 0.6  # Few-shot examples are sophisticated
         if has_cot:
-            reasoning_strategy = 9  # Few-shot + CoT (most sophisticated)
-        elif length > 1500:
-            reasoning_strategy = 8  # Extensive few-shot
+            sophistication_score += 0.3  # Few-shot + CoT is most sophisticated
+        elif len(prompt) > 1500:
+            sophistication_score += 0.2  # Extensive few-shot
         else:
-            reasoning_strategy = 7  # Basic few-shot
+            sophistication_score += 0.1  # Basic few-shot
     elif has_cot:
-        # Chain-of-thought (bins 4-6)
-        if "must" in prompt_lower or "exactly" in prompt_lower:
-            reasoning_strategy = 6  # Strict CoT
-        elif length > 500:
-            reasoning_strategy = 5  # Detailed CoT
+        sophistication_score += 0.4  # Chain-of-thought
+        if has_strict:
+            sophistication_score += 0.2  # Strict CoT
+        elif len(prompt) > 500:
+            sophistication_score += 0.15  # Detailed CoT
         else:
-            reasoning_strategy = 4  # Basic CoT
+            sophistication_score += 0.1  # Basic CoT
     else:
-        # Basic prompts (bins 0-3)
-        if length < 100:
-            reasoning_strategy = 0  # Minimal
-        elif "solve" in prompt_lower or "calculate" in prompt_lower:
-            reasoning_strategy = 2  # Direct instruction
+        # Basic prompts
+        if has_directive:
+            sophistication_score += 0.2  # Direct instruction
         else:
-            reasoning_strategy = 1  # Simple prompt
+            sophistication_score += 0.1  # Simple prompt
+
+    # Ensure score is within 0.0-1.0 range
+    sophistication_score = min(1.0, max(0.0, sophistication_score))

-    return prompt_length, reasoning_strategy
+    return prompt_length, sophistication_score


 def load_prompt_config(prompt_path):

@@ -492,13 +489,15 @@ def evaluate_stage1(prompt_path):
         print("-" * 80)

         # Calculate custom features
-        prompt_length, reasoning_strategy = calculate_prompt_features(prompt)
-        print(f"Prompt features - Length bin: {prompt_length}, Reasoning bin: {reasoning_strategy}")
+        prompt_length, reasoning_sophistication = calculate_prompt_features(prompt)
+        print(
+            f"Prompt features - Length: {prompt_length} chars, Reasoning sophistication: {reasoning_sophistication:.3f}"
+        )

         return {
             "combined_score": accuracy,
             "prompt_length": prompt_length,
-            "reasoning_strategy": reasoning_strategy,
+            "reasoning_strategy": reasoning_sophistication,
         }

     except Exception as e:

@@ -511,15 +510,15 @@ def evaluate_stage1(prompt_path):
             # Try to calculate features from the failed prompt
             with open(prompt_path, "r") as f:
                 failed_prompt = f.read().strip()
-            prompt_length, reasoning_strategy = calculate_prompt_features(failed_prompt)
+            prompt_length, reasoning_sophistication = calculate_prompt_features(failed_prompt)
         except:
             # Fallback values if prompt can't be read
-            prompt_length, reasoning_strategy = 0, 0
+            prompt_length, reasoning_sophistication = 0, 0.0

         return {
             "combined_score": 0.0,
             "prompt_length": prompt_length,
-            "reasoning_strategy": reasoning_strategy,
+            "reasoning_strategy": reasoning_sophistication,
             "error": str(e),
         }


@@ -560,13 +559,15 @@ def evaluate_stage2(prompt_path):
         print("-" * 80)

         # Calculate custom features
-        prompt_length, reasoning_strategy = calculate_prompt_features(prompt)
-        print(f"Prompt features - Length bin: {prompt_length}, Reasoning bin: {reasoning_strategy}")
+        prompt_length, reasoning_sophistication = calculate_prompt_features(prompt)
+        print(
+            f"Prompt features - Length: {prompt_length} chars, Reasoning sophistication: {reasoning_sophistication:.3f}"
+        )

         return {
             "combined_score": accuracy,
             "prompt_length": prompt_length,
-            "reasoning_strategy": reasoning_strategy,
+            "reasoning_strategy": reasoning_sophistication,
         }

     except Exception as e:

@@ -579,15 +580,15 @@ def evaluate_stage2(prompt_path):
             # Try to calculate features from the failed prompt
             with open(prompt_path, "r") as f:
                 failed_prompt = f.read().strip()
-            prompt_length, reasoning_strategy = calculate_prompt_features(failed_prompt)
+            prompt_length, reasoning_sophistication = calculate_prompt_features(failed_prompt)
         except:
             # Fallback values if prompt can't be read
-            prompt_length, reasoning_strategy = 0, 0
+            prompt_length, reasoning_sophistication = 0, 0.0

         return {
             "combined_score": 0.0,
             "prompt_length": prompt_length,
-            "reasoning_strategy": reasoning_strategy,
+            "reasoning_strategy": reasoning_sophistication,
             "error": str(e),
         }
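A quick usage sketch of the updated feature function shown in the diff above. The import path and the sample prompt are assumptions for illustration, and the commented values are approximate.

```python
# Assumes calculate_prompt_features as defined in the diff above; the import
# path and sample prompt are hypothetical.
from evaluator import calculate_prompt_features

prompt = (
    "Let's think step by step. First, identify the numbers in the problem, "
    "then calculate the answer. You must show your reasoning exactly."
)
length, sophistication = calculate_prompt_features(prompt)
print(length)          # Raw character count (~134), not a 0-9 length bin
print(sophistication)  # Continuous score in [0.0, 1.0] (roughly 0.7 here), not a strategy bin
```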

openevolve/config.py
Lines changed: 13 additions & 1 deletion

@@ -186,7 +186,19 @@ class DatabaseConfig:

     # Feature map dimensions for MAP-Elites
     # Default to complexity and diversity for better exploration
-    feature_dimensions: List[str] = field(default_factory=lambda: ["complexity", "diversity"])
+    # CRITICAL: For custom dimensions, evaluators must return RAW VALUES, not bin indices
+    # Built-in: "complexity", "diversity", "score" (always available)
+    # Custom: Any metric from your evaluator (must be continuous values)
+    feature_dimensions: List[str] = field(
+        default_factory=lambda: ["complexity", "diversity"],
+        metadata={
+            "help": "List of feature dimensions for MAP-Elites grid. "
+            "Built-in dimensions: 'complexity', 'diversity', 'score'. "
+            "Custom dimensions: Must match metric names from evaluator. "
+            "IMPORTANT: Evaluators must return raw continuous values for custom dimensions, "
+            "NOT pre-computed bin indices. OpenEvolve handles all scaling and binning internally."
+        }
+    )
     feature_bins: Union[int, Dict[str, int]] = 10  # Can be int (all dims) or dict (per-dim)
     diversity_reference_size: int = 20  # Size of reference set for diversity calculation
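A short sketch of how the fields visible in this hunk might be set from code. It assumes DatabaseConfig's remaining fields all have defaults, and the custom dimension names ("prompt_length", "execution_time") are hypothetical metric names that an evaluator would return as raw values.

```python
# Sketch under assumptions: only the fields shown in this hunk are used, and
# the custom dimension names are hypothetical evaluator metric names.
from openevolve.config import DatabaseConfig

db_config = DatabaseConfig(
    # Custom dimensions must match metric names returned by the evaluator
    feature_dimensions=["complexity", "prompt_length", "execution_time"],
    # feature_bins may be a single int for all dimensions or a per-dimension dict
    feature_bins={"complexity": 10, "prompt_length": 10, "execution_time": 5},
)
print(db_config.feature_dimensions)
```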

openevolve/evaluation_result.py
Lines changed: 8 additions & 0 deletions

@@ -14,6 +14,14 @@ class EvaluationResult:

     This maintains backward compatibility with the existing dict[str, float] contract
     while adding a side-channel for arbitrary artifacts (text or binary data).
+
+    IMPORTANT: For custom MAP-Elites features, metrics values must be raw continuous
+    scores (e.g., actual counts, percentages, continuous measurements), NOT pre-computed
+    bin indices. The database handles all binning internally using min-max scaling.
+
+    Examples:
+        ✅ Correct: {"combined_score": 0.85, "prompt_length": 1247, "execution_time": 0.234}
+        ❌ Wrong: {"combined_score": 0.85, "prompt_length": 7, "execution_time": 3}
     """

     metrics: Dict[str, float]  # mandatory - existing contract
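For completeness, a minimal construction sketch based only on the `metrics` field visible in this excerpt; any other EvaluationResult fields (such as the artifacts side-channel mentioned in the docstring) are assumed here to be optional.

```python
# Minimal sketch: pass raw continuous metric values, never pre-computed bin indices.
from openevolve.evaluation_result import EvaluationResult

result = EvaluationResult(
    metrics={
        "combined_score": 0.85,
        "prompt_length": 1247.0,   # Raw character count
        "execution_time": 0.234,   # Raw seconds
    }
)
print(result.metrics["prompt_length"])
```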
