# rStar/variation_generator.py (main branch of the Stonej29/rStar repository on GitHub)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
import argparse
import json
import random
import re

def main():
    """Command-line entry point: expand templates into problem variations.

    Loads the JSON document named by ``--input`` and iterates over its
    templates.  For each template it keeps calling ``generate_variation``
    until ``--num_variations`` candidates have been accepted or the retry
    budget is used up, then writes every accepted variation as one JSON list
    to ``--output``.  A shortfall for a template is reported on stdout.
    """
    parser = argparse.ArgumentParser(description="Generate variations for templates.")
    parser.add_argument('--input', type=str, required=True, help="Input JSON template file")
    parser.add_argument('--output', type=str, required=True, help="Output JSON file")
    parser.add_argument('--type', type=str, choices=['name', 'numbers', 'combined', 'irrelevant'], required=True, help="Type of variation: 'name', 'numbers', 'combined', or 'irrelevant'")
    parser.add_argument('--num_variations', type=int, required=True, help="Number of variations per template")

    args = parser.parse_args()

    # JSON text is UTF-8 by specification; do not depend on the platform's
    # default locale encoding when reading or writing it.
    with open(args.input, 'r', encoding='utf-8') as f:
        templates = json.load(f)

    # Generate variations
    variations = []
    for template in templates:
        num_generated = 0
        attempts = 0
        # generate_variation returns None for rejected candidates, so the
        # number of retries has to be bounded to guarantee termination.
        max_attempts = args.num_variations * 1000
        while num_generated < args.num_variations and attempts < max_attempts:
            variation = generate_variation(template, args.type)
            attempts += 1
            if variation is not None:
                variations.append(variation)
                num_generated += 1
        if num_generated < args.num_variations:
            print(f"Only generated {num_generated} variations for template id {template.get('id')} after {attempts} attempts.")

    with open(args.output, 'w', encoding='utf-8') as f:
        json.dump(variations, f, indent=4)

def generate_variation(template, variation_type):
    """Produce one concrete problem/solution pair from a template.

    Args:
        template: Mapping describing a single template.  This function reads
            its 'id', 'problem' and 'solution' entries; the context helpers
            it calls read further entries from the same mapping.  Only a
            shallow copy is modified, so the caller's mapping keeps its
            original 'problem' and 'solution' strings.
        variation_type: One of 'name', 'numbers', 'combined' or 'irrelevant'.
            It selects whether numbers and/or names come from the template's
            defaults or are drawn at random; 'irrelevant' uses defaults for
            both and additionally inserts a distractor sentence into the
            problem text.

    Returns:
        A dict with exactly the keys 'id', 'problem' and 'solution', or None
        when the candidate has to be discarded (a context could not be
        built, the text could not be formatted, or the final answer is not
        an integer).

    Raises:
        ValueError: If ``variation_type`` is not one of the four kinds above.
    """
    # variation type -> (use default numbers?, use default names?)
    default_flags = {
        'name': (True, False),
        'numbers': (False, True),
        'combined': (False, False),
        'irrelevant': (True, True),
    }
    if variation_type not in default_flags:
        raise ValueError("Invalid variation type")
    default_numbers, default_names = default_flags[variation_type]

    variation = template.copy()

    # Numbers are resolved before names, as in every branch of the original.
    numerical_context = get_numerical_context(variation, default=default_numbers)
    if numerical_context is None:
        return None
    name_context = get_name_context(variation, default=default_names)
    if name_context is None:
        return None

    # Name entries are applied last and therefore win on a key clash.
    context = {**numerical_context, **name_context}

    try:
        variation['problem'] = variation['problem'].format(**context)
        variation['solution'] = variation['solution'].format(**context)
    except KeyError as e:
        print(f"Variable {e} not found when formatting")
        return None
    except (IndexError, ValueError) as e:
        # str.format raises these for positional fields ("{}" / "{0}"),
        # unbalanced braces or a bad format spec.  Reject the candidate
        # instead of aborting the whole run.
        print(f"Malformed placeholder when formatting: {e}")
        return None

    variation = evaluate_expressions(variation, context)

    # If type is 'irrelevant', insert an irrelevant sentence into the problem
    if variation_type == 'irrelevant':
        variation['problem'] = insert_irrelevant_sentence(variation['problem'])

    if not final_answer_is_integer(variation['solution']):
        return None  # Discard and let the caller try again

    # Only these three fields are exported; the rest of the template
    # (variables, calculations, ...) is dropped from the output.
    return {
        "id": variation.get('id'),
        "problem": variation.get('problem'),
        "solution": variation.get('solution'),
    }

def get_numerical_context(variation, default=False):
    """Build the mapping of numerical variable names to concrete values.

    The template entries read here are:

    * ``variation['variables']['numerical']`` - variable name -> ``[low, high]``
    * ``variation['calculations']`` - derived variable name -> expression,
      evaluated in order with ``eval_expr`` against the values so far
    * ``variation['conditions']`` - variable name -> ``[low, high]`` bounds
      (inclusive) that a randomly drawn assignment must satisfy
    * ``variation['default_values']`` - ``'<name>_default'`` -> value
      (only used when ``default`` is true)

    Args:
        variation: Template mapping; missing entries are treated as empty.
        default: When true, every numerical variable takes its default value
            and conditions are not checked.  When false, values are drawn at
            random from the ranges (``random.randint`` when both bounds are
            ints, otherwise ``random.uniform`` rounded to 2 decimals) until
            the calculations succeed and all conditions hold.

    Returns:
        A dict holding the base variables plus every calculated variable, or
        None when a default is missing, a range is malformed, a calculation
        cannot be evaluated with default values, or no valid random
        assignment is found within 1000 attempts.
    """
    variables = variation.get('variables', {}).get('numerical', {})
    calculations = variation.get('calculations', {})
    conditions = variation.get('conditions', {})

    if default:
        default_values = variation.get('default_values', {})
        context = {}
        for var in variables:
            default_var_name = var + '_default'
            if default_var_name not in default_values:
                print(f"Default value for numerical variable {var} not found")
                return None
            context[var] = default_values[default_var_name]
        error = _apply_calculations(calculations, context)
        if error is not None:
            print(error)
            return None
        return context

    # Validate the ranges once, up front, rather than on every attempt.
    for var, range_ in variables.items():
        well_formed = (
            isinstance(range_, list)
            and len(range_) == 2
            and all(isinstance(n, (int, float)) for n in range_)
        )
        # random.randint raises ValueError for an empty (reversed) int range.
        if well_formed and all(isinstance(n, int) for n in range_):
            well_formed = range_[0] <= range_[1]
        if not well_formed:
            print(f"Invalid range for variable {var}")
            return None

    # Randomly assign values within the ranges until the conditions hold.
    max_attempts = 1000
    for _ in range(max_attempts):
        context = {}
        for var, (low, high) in variables.items():
            if isinstance(low, int) and isinstance(high, int):
                context[var] = random.randint(low, high)
            else:
                context[var] = round(random.uniform(low, high), 2)
        # A failed calculation (e.g. division by a randomly drawn zero) must
        # not leak None into the context; draw a fresh assignment instead.
        if _apply_calculations(calculations, context) is not None:
            continue
        if _conditions_met(conditions, context):
            return context

    # Max attempts reached
    return None


def _apply_calculations(calculations, context):
    """Evaluate each calculation into ``context``; return an error message or None."""
    for var, expr in calculations.items():
        try:
            value = eval_expr(expr, context)
        except Exception as e:
            return f"Error in calculation of {var}: {e}"
        # eval_expr signals a failed evaluation by returning None.
        if value is None:
            return f"Error in calculation of {var}: could not evaluate {expr!r}"
        context[var] = value
    return None


def _conditions_met(conditions, context):
    """Return True when every conditioned variable lies within its inclusive bounds."""
    for var, range_ in conditions.items():
        value = context.get(var)
        if value is None:
            return False
        if not (isinstance(range_, list) and len(range_) == 2):
            print(f"Invalid condition for variable {var}")
            return False
        if not (range_[0] <= value <= range_[1]):
            return False
    return True

def is_integer(value):
    """Report whether ``value`` is a whole number.

    Any ``int`` (which includes ``bool``) counts, as does a ``float`` with no
    fractional part.  Every other type yields False.
    """
    if isinstance(value, int):
        return True
    if isinstance(value, float):
        return value.is_integer()
    return False

def eval_expr(expr, context):
    """Evaluate ``expr`` with the entries of ``context`` as local names.

    A float result that has no fractional part is returned as an ``int``.
    Any exception raised during evaluation is swallowed and reported to the
    caller as a ``None`` result.
    """
    # NOTE(security): eval executes arbitrary Python taken from the template;
    # only use this with template files from a trusted source.
    try:
        result = eval(expr, {}, context)
        is_whole_float = isinstance(result, float) and result.is_integer()
        return int(result) if is_whole_float else result
    except Exception:
        return None

def get_name_context(variation, default=False):
    """Map every name placeholder in the template text to a concrete name.

    Placeholders have the form ``{name_m_<digits>}`` or ``{name_f_<digits>}``
    and are collected from the template's 'problem' and 'solution' strings.

    Args:
        variation: Template mapping; 'problem', 'solution' and (when
            ``default`` is true) 'default_values' are read.
        default: When true, each placeholder ``name_x_N`` takes the value
            stored under ``'name_x_N_default'`` in 'default_values'.  When
            false, a name is drawn at random from the built-in male or
            female list according to the placeholder's ``m``/``f`` marker.
            Distinct placeholders receive distinct names for as long as the
            matching list has unused names left.

    Returns:
        A dict from placeholder name (without braces) to the chosen name, or
        None when a required default value is missing.
    """
    male_names = ["James", "John", "Robert", "Michael", "William", "David", "Richard", "Joseph", "Charles", "Thomas", "Daniel", "Matthew", "Anthony", "Mark", "Donald", "Steven", "Paul", "Andrew", "Joshua", "Kevin"]
    female_names = ["Mary", "Patricia", "Jennifer", "Elizabeth", "Linda", "Barbara", "Susan", "Margaret", "Jessica", "Sarah", "Karen", "Nancy", "Lisa", "Betty", "Sandra", "Ashley", "Dorothy", "Kimberly", "Emily", "Donna"]
    names_by_gender = {'m': male_names, 'f': female_names}

    # Find all name placeholders in problem and solution
    text = variation.get('problem', '') + ' ' + variation.get('solution', '')
    # Sorted so that the assignment order does not depend on set iteration
    # order, which keeps results reproducible under a seeded RNG.
    name_placeholders = sorted(set(re.findall(r"{name_[mf]_\d+}", text)))

    default_values = variation.get('default_values', {})
    # Names not yet handed out, per gender, so that two different
    # placeholders do not end up referring to people with the same name.
    unused = {gender: list(names) for gender, names in names_by_gender.items()}

    name_mapping = {}
    for placeholder in name_placeholders:
        var_name = placeholder.strip('{}')
        if default:
            default_var_name = var_name + '_default'
            if default_var_name not in default_values:
                print(f"Default value for name variable {var_name} not found")
                return None
            name_mapping[var_name] = default_values[default_var_name]
            continue

        # The regex guarantees the second underscore-separated field is 'm' or 'f'.
        gender = var_name.split('_')[1]
        pool = unused[gender]
        if not pool:
            # More placeholders than names: start reusing the full list.
            pool.extend(names_by_gender[gender])
        name = random.choice(pool)
        pool.remove(name)
        name_mapping[var_name] = name

    return name_mapping

def evaluate_expressions(variation, context):
    """Recompute every ``<<...>>`` annotation in the solution text.

    For an annotation of the form ``<<lhs=rhs>>`` the left-hand side is
    evaluated and the annotation becomes ``<<lhs=value>>``; the old
    right-hand side is discarded.  An annotation without ``=`` is evaluated
    as a whole and replaced by ``<<value>>``.  Float results with no
    fractional part are shown as ints.  An annotation that contains more
    than one ``=`` or that cannot be formatted or evaluated is left exactly
    as it was.

    Args:
        variation: Mapping with a 'solution' string; it is updated in place.
        context: Variable values, used both for ``str.format`` on the
            annotation and as local names during evaluation.

    Returns:
        The same ``variation`` mapping, with 'solution' rewritten.
    """
    def _as_display_value(value):
        # Show 6.0 as 6 so whole-number results read naturally.
        if isinstance(value, float) and value.is_integer():
            return int(value)
        return value

    def _recompute(match):
        expr = match.group(1)
        try:
            expr_formatted = expr.format(**context)
            lhs, sep, rest = expr_formatted.partition('=')
            if sep and '=' in rest:
                # Several '=' signs: ambiguous, keep the annotation untouched.
                return match.group(0)
            # NOTE(security): eval runs arbitrary Python from the template;
            # only use this with templates from a trusted source.
            if sep:
                lhs = lhs.strip()
                value = _as_display_value(eval(lhs, {}, context))
                return f"<<{lhs}={value}>>"
            value = _as_display_value(eval(expr_formatted, {}, context))
            return f"<<{str(value)}>>"
        except Exception:
            # Best effort: anything that is not a computable expression
            # stays in the text unchanged.
            return match.group(0)

    # A single pass over the text: each annotation is rewritten exactly once
    # and already substituted text is never scanned again.
    variation['solution'] = re.sub(r'<<([^>]*)>>', _recompute, variation['solution'])
    return variation

def final_answer_is_integer(solution_text):
    """Tell whether the answer following ``####`` is a whole number.

    The rest of the line after the first ``####`` marker is taken, every
    character other than digits, ``.`` and ``-`` is dropped, and what
    remains is parsed as a float.  Returns False when there is no marker or
    the remaining text is not a valid number.
    """
    marker = re.search(r'####\s*(.+)', solution_text)
    if not marker:
        return False

    answer_text = marker.group(1).strip()
    # Strip units, currency signs, thousands separators and the like.
    numeric_text = re.sub(r'[^\d\.\-]', '', answer_text)
    try:
        return float(numeric_text).is_integer()
    except ValueError:
        return False

def insert_irrelevant_sentence(problem_text):
    """Return ``problem_text`` with one random distractor sentence added.

    The text is split after ``.``, ``!`` or ``?`` followed by spaces; a
    sentence picked at random from a fixed list is placed at a random
    sentence boundary (the very start and the very end included) and the
    pieces are joined back together with single spaces.
    """
    irrelevant_sentences = [
        "The sun is approximately 149.6 million kilometers away from the Earth.",
        "There are 206 bones in the adult human body.",
        "Mount Everest is 8,848 meters tall.",
        "Light travels at a speed of about 299,792 kilometers per second.",
        "The Great Wall of China is over 21,000 kilometers long.",
        "A year on Mercury lasts 88 Earth days.",
        "The tallest building in the world is 828 meters high.",
        "The Earth's atmosphere is composed of 78% nitrogen.",
        "The average human heart beats about 100,000 times per day.",
        "There are 9.461 trillion kilometers in a light-year.",
        "Water boils at 100 degrees Celsius.",
        "The Pacific Ocean covers about 165 million square kilometers.",
        "The moon is about 384,400 kilometers away from Earth.",
        "There are 365 days in a non-leap year.",
        "The Eiffel Tower is 324 meters tall.",
        "An adult blue whale can weigh up to 150,000 kilograms.",
        "The human eye blinks approximately 4,200,000 times a year.",
        "There are about 7.8 billion people on Earth.",
        "A marathon is 42.195 kilometers long.",
        "The Amazon River is approximately 6,400 kilometers long."
    ]

    pieces = re.split('(?<=[.!?]) +', problem_text)
    # The position is drawn before the sentence so the sequence of random
    # calls stays the same for a seeded generator.
    slot = random.randint(0, len(pieces))
    distractor = random.choice(irrelevant_sentences)
    return ' '.join(pieces[:slot] + [distractor] + pieces[slot:])

# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()