|
51 | 51 | DATASET_CONFIG_PATH = os.path.join(evaluator_dir, dataset_filename) |
52 | 52 | print(f"Dataset configuration: {dataset_filename}") |
53 | 53 |
|
| 54 | + |
| 55 | +def calculate_prompt_features(prompt):
| 56 | +    """
| 57 | +    Calculate custom features for MAP-Elites binning
| 58 | +
| 59 | +    Returns:
| 60 | +        tuple: (prompt_length, reasoning_strategy) - both in range 0-9
| 61 | +    """
| 62 | +    # Feature 1: Prompt length bin (0-9)
| 63 | +    length = len(prompt)
| 64 | +    if length < 100:
| 65 | +        prompt_length = 0  # Minimal
| 66 | +    elif length < 200:
| 67 | +        prompt_length = 1  # Very short
| 68 | +    elif length < 400:
| 69 | +        prompt_length = 2  # Short
| 70 | +    elif length < 600:
| 71 | +        prompt_length = 3  # Medium-short
| 72 | +    elif length < 900:
| 73 | +        prompt_length = 4  # Medium
| 74 | +    elif length < 1200:
| 75 | +        prompt_length = 5  # Medium-long
| 76 | +    elif length < 1600:
| 77 | +        prompt_length = 6  # Long
| 78 | +    elif length < 2000:
| 79 | +        prompt_length = 7  # Very long
| 80 | +    elif length < 2500:
| 81 | +        prompt_length = 8  # Extensive
| 82 | +    else:
| 83 | +        prompt_length = 9  # Very extensive
| 84 | +
| 85 | +    # Feature 2: Reasoning strategy (0-9)
| 86 | +    prompt_lower = prompt.lower()
| 87 | +
| 88 | +    # Check for few-shot examples
| 89 | +    has_example = ('example' in prompt_lower or
| 90 | +                   prompt.count('####') >= 4 or
| 91 | +                   bool(re.search(r'problem:.*?solution:', prompt_lower, re.DOTALL)))
| 92 | +
| 93 | +    # Check for Chain-of-Thought (CoT) indicators
| 94 | +    has_cot = ('step by step' in prompt_lower or
| 95 | +               'step-by-step' in prompt_lower or
| 96 | +               any(phrase in prompt_lower for phrase in ['think through', 'reasoning', 'explain your']) or
| 97 | +               bool(re.search(r'\b(first|then|next|finally)\b', prompt_lower)))
| 98 | +
| 99 | +    # Assign reasoning strategy bins
| 100 | +    if has_example:
| 101 | +        # Few-shot examples (bins 7-9)
| 102 | +        if has_cot:
| 103 | +            reasoning_strategy = 9  # Few-shot + CoT (most sophisticated)
| 104 | +        elif length > 1500:
| 105 | +            reasoning_strategy = 8  # Extensive few-shot
| 106 | +        else:
| 107 | +            reasoning_strategy = 7  # Basic few-shot
| 108 | +    elif has_cot:
| 109 | +        # Chain-of-thought (bins 4-6)
| 110 | +        if 'must' in prompt_lower or 'exactly' in prompt_lower:
| 111 | +            reasoning_strategy = 6  # Strict CoT
| 112 | +        elif length > 500:
| 113 | +            reasoning_strategy = 5  # Detailed CoT
| 114 | +        else:
| 115 | +            reasoning_strategy = 4  # Basic CoT
| 116 | +    else:
| 117 | +        # Basic prompts (bins 0-2; bin 3 is left unused)
| 118 | +        if length < 100:
| 119 | +            reasoning_strategy = 0  # Minimal
| 120 | +        elif 'solve' in prompt_lower or 'calculate' in prompt_lower:
| 121 | +            reasoning_strategy = 2  # Direct instruction
| 122 | +        else:
| 123 | +            reasoning_strategy = 1  # Simple prompt
| 124 | +
| 125 | +    return prompt_length, reasoning_strategy
| 126 | + |
| 127 | + |
54 | 128 | def load_prompt_config(prompt_path): |
55 | 129 |     """Load the prompt from text file and dataset config from matching _dataset.yaml file."""
56 | 130 |     # Load prompt from text file
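For orientation, here is a minimal sketch of how the new helper bins a prompt. The `evaluator` module name is a placeholder for this file, and `import re` is assumed to be present at the top of the file, since `calculate_prompt_features` depends on it:

```python
# Hypothetical usage; "evaluator" is a placeholder for this file's module name.
from evaluator import calculate_prompt_features

prompt = (
    "Solve the problem step by step. First, identify the quantities, "
    "then set up the equation, and finally compute the answer."
)

length_bin, strategy_bin = calculate_prompt_features(prompt)
# 121 characters -> length bin 1 ("Very short"); "step by step" trips the
# CoT check, and with no 'must'/'exactly' and length <= 500 the strategy
# falls into bin 4 ("Basic CoT").
print(length_bin, strategy_bin)  # (1, 4)
```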
@@ -280,8 +354,14 @@ def evaluate_stage1(prompt_path): |
280 | 354 |         print(f"Stage 1 accuracy: {accuracy:.3f} ({correct}/{total})")
281 | 355 |         print('-' * 80)
282 | 356 |
|
| 357 | +        # Calculate custom features
| 358 | +        prompt_length, reasoning_strategy = calculate_prompt_features(prompt)
| 359 | +        print(f"Prompt features - Length bin: {prompt_length}, Reasoning bin: {reasoning_strategy}")
| 360 | +
283 | 361 |         return {
284 | | -            "combined_score": accuracy
| 362 | +            "combined_score": accuracy,
| 363 | +            "prompt_length": prompt_length,
| 364 | +            "reasoning_strategy": reasoning_strategy
285 | 365 |         }
286 | 366 |
|
287 | 367 |     except Exception as e:
@@ -329,8 +409,14 @@ def evaluate_stage2(prompt_path): |
329 | 409 |         print(f"Stage 2 accuracy: {accuracy:.3f} ({correct}/{total})")
330 | 410 |         print('-' * 80)
331 | 411 |
|
| 412 | +        # Calculate custom features
| 413 | +        prompt_length, reasoning_strategy = calculate_prompt_features(prompt)
| 414 | +        print(f"Prompt features - Length bin: {prompt_length}, Reasoning bin: {reasoning_strategy}")
| 415 | +
332 | 416 |         return {
333 | | -            "combined_score": accuracy
| 417 | +            "combined_score": accuracy,
| 418 | +            "prompt_length": prompt_length,
| 419 | +            "reasoning_strategy": reasoning_strategy
334 | 420 |         }
335 | 421 |
|
336 | 422 |     except Exception as e:
|
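With both stages now returning `prompt_length` and `reasoning_strategy` alongside `combined_score`, the two extra keys can serve as MAP-Elites behavior descriptors on a 10x10 grid. A minimal sketch of the consuming side, where the `archive` layout and `update_archive` helper are illustrative and not part of this diff:

```python
# Illustrative MAP-Elites archive keyed by the two feature bins that
# evaluate_stage1/evaluate_stage2 now report; not part of this diff.
archive = {}  # (prompt_length, reasoning_strategy) -> (combined_score, prompt)

def update_archive(prompt, metrics):
    """Keep the best-scoring prompt seen so far in each feature cell."""
    cell = (metrics["prompt_length"], metrics["reasoning_strategy"])
    if cell not in archive or metrics["combined_score"] > archive[cell][0]:
        archive[cell] = (metrics["combined_score"], prompt)
```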