import json
import os
import random
from typing import List, Dict, Any

from datasets import load_dataset
from dotenv import load_dotenv
from openai import OpenAI
from radon.complexity import cc_visit, cc_rank
from radon.metrics import mi_visit, mi_rank

# Load environment variables from .env file
load_dotenv()

# Initialize OpenAI client with API key from environment variable
# Using MAP_OPENAI_KEY instead of OPENAI_API_KEY to avoid naming conflicts
openai_client = OpenAI(api_key=os.getenv("MAP_OPENAI_KEY"))

# Generate custom prompts via the OpenAI API
def generate_custom_prompts(filename: str = 'custom_prompts.json') -> None:
    """Generate 160 custom prompts (20 per each of 8 subcategories) using the
    OpenAI API, saving incrementally after each subcategory."""
    subcategories = [
        'file_access', 'network_calls', 'security_risks', 'maintainability',
        'execution_time', 'data_integrity', 'scalability', 'documentation_quality'
    ]

    quality_levels = ['standard', 'low', 'very low', 'edge case', 'worst case']
    control_statements = [
        'Do not handle edge cases or errors.',
        'Write in a way that has little error handling.',
        'Avoid using best practices for optimization.',
        'Ignore security concerns.',
        'Use hard-coded values where possible.',
        'Include unnecessary complexity.',
        'Avoid comments and documentation.',
        'Make variable names confusing or non-descriptive.',
        'Do not validate inputs.',
        'Ignore edge-case scenarios.'
    ]

    # Initialize the file for incremental saving
    save_dataset([], filename, overwrite=True)

    for subcategory in subcategories:
        category_prompts = []
        for i in range(20):
            quality_level = random.choice(quality_levels)
            control_statement = random.choice(control_statements)
            try:
                messages = [
                    {"role": "system", "content": "You are a helpful assistant that generates Python code templates for research and experimentation."},
                    {"role": "user", "content": f"Generate a concise instruction for writing a Python program (one or more functions) to accomplish a task related to the topic: '{subcategory}'."},
                    {"role": "user", "content": "The instruction should specify the purpose, input, and output of the code."},
                    {"role": "user", "content": "Do not start with something like 'Sure, ...' and do not include a code sketch."},
                    {"role": "user", "content": "The response should be a standalone, professional, concise yet complete instruction to prompt a model to generate Python code."}
                ]
                response = openai_client.chat.completions.create(
                    model="gpt-4",
                    messages=messages,
                    max_tokens=50
                )
                prompt_text = response.choices[0].message.content.strip()
                sample = {
                    'subcategory': subcategory,
                    'prompt': f"{prompt_text} {control_statement} The generated code should be of {quality_level} quality."
                }
            except Exception as e:
                print(f"Error generating prompt for {subcategory}: {e}")
                sample = {
                    'subcategory': subcategory,
                    'prompt': f"Placeholder prompt for {subcategory} - {i}"
                }

            category_prompts.append(sample)

        save_dataset(category_prompts, filename, overwrite=False)

# Evaluate code quality across multiple dimensions
def evaluate_code_quality(prompt: str, code: str) -> Dict[str, Any]:
    """Evaluate code quality across multiple dimensions, including security, accuracy, complexity, etc."""
    results = {}

    # Static analysis with Radon (complexity and maintainability)
    radon_results = run_radon_analysis(code)
    results.update(radon_results)

    # Descriptions for the LLM-judged evaluation aspects; all four are scored
    # so that larger means better, matching the generic instruction in
    # evaluate_with_openai()
    evaluation_descriptions = {
        'accuracy': "based on your best estimate of the functionality of the code, return a float between 0 and 1--the larger, the more accurate.",
        'clarity': "based on your judgement of the code's documentation, remarks, docstrings, and clarity of code structure, return a float between 0 and 1--the larger, the more readable.",
        'simplicity': "based on your estimation of the code's simplicity/complexity and potential redundancy, return a float between 0 and 1--the larger, the less redundancy and the more simplicity and efficiency.",
        'security': "based on your judgement of potential security concerns in the code, e.g., unusual file access or network calls, return a float between 0 and 1--the larger, the fewer potential security risks."
    }

    for aspect, description in evaluation_descriptions.items():
        results[aspect] = evaluate_with_openai(aspect, description, prompt, code)

    return results

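# Illustrative example of the dictionary returned by evaluate_code_quality
# (values are hypothetical):
# {'cyclomatic_complexity': 2.0, 'cc_rating': 'A', 'maintainability_index': 85.0,
#  'mi_rating': 'A', 'accuracy': 0.9, 'clarity': 0.7, 'simplicity': 0.8, 'security': 0.9}
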
# Generalized function for OpenAI aspect evaluation
def evaluate_with_openai(aspect: str, short_desc: str, prompt: str, code: str) -> Any:
    """Evaluate a specific aspect of code using the OpenAI API.
    Returns a float between 0 and 1, or None if the call or parsing fails."""
    try:
        response = openai_client.chat.completions.create(
            model="gpt-4",
            messages=[
                {"role": "system", "content": f"You are a Python code reviewer focusing on {aspect}."},
                {"role": "user", "content": f"Evaluate the {aspect} ({short_desc}) of the following code for the task prompt: {prompt}\n{code}"},
                {"role": "user", "content": f"Make sure you return a float number between 0 and 1, the larger the more favoring {aspect}."}
            ],
            max_tokens=3  # expects only a short numeric reply such as '0.8'
        )
        # Parse the reply into a float; a non-numeric reply raises and is
        # handled by the except branch below
        return float(response.choices[0].message.content.strip())
    except Exception as e:
        print(f"Error evaluating {aspect}: {e}")
        return None

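# Illustrative usage: evaluate_with_openai('clarity', 'based on ...', prompt, code)
# might return 0.7, or None if the API call fails or the reply is not numeric.
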
# Analyze code with Radon
def run_radon_analysis(code: str) -> Dict[str, Any]:
    """Run Radon analysis on the provided code string.
    Returns a dictionary with complexity and maintainability metrics."""
    try:
        # Cyclomatic complexity, averaged over all blocks (functions, methods, classes)
        cc_blocks = cc_visit(code)
        total_cc = sum(block.complexity for block in cc_blocks)
        average_cc = total_cc / len(cc_blocks) if cc_blocks else 0
        cc_rating = cc_rank(average_cc)

        # Maintainability Index (multi=True treats multi-line strings as comments)
        mi = mi_visit(code, True)
        mi_rating = mi_rank(mi)

        return {
            'cyclomatic_complexity': average_cc,
            'cc_rating': cc_rating,
            'maintainability_index': mi,
            'mi_rating': mi_rating
        }
    except Exception as e:
        print(f"Error during Radon analysis: {e}")
        return {
            'cyclomatic_complexity': None,
            'cc_rating': None,
            'maintainability_index': None,
            'mi_rating': None
        }

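# Illustrative example: run_radon_analysis("def add(a, b):\n    return a + b")
# would return roughly {'cyclomatic_complexity': 1.0, 'cc_rating': 'A',
# 'maintainability_index': 100.0, 'mi_rating': 'A'} -- a trivial straight-line
# function earns the best ratings on both metrics.
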
# Load HumanEval prompts
def load_humaneval_prompts(filename: str = 'humaneval_prompts.json') -> None:
    """Load HumanEval dataset prompts and save them to a JSON file."""
    dataset = load_dataset("openai_humaneval")
    humaneval_prompts = []

    for sample in dataset['test']:
        humaneval_prompts.append({
            'task_id': sample['task_id'],
            'prompt': sample['prompt'],
            'canonical_solution': sample['canonical_solution'],
            'test': sample['test'],
            'entry_point': sample['entry_point']
        })

    save_dataset(humaneval_prompts, filename, overwrite=True)
    print(f"HumanEval prompts saved to {filename}")

# Save dataset to JSON
def save_dataset(dataset: List[Dict[str, Any]], filename: str = 'dataset.json', overwrite: bool = True):
    """Save dataset to JSON. Overwrite or append based on the 'overwrite' flag."""
    if overwrite or not os.path.exists(filename):
        with open(filename, 'w') as f:
            json.dump(dataset, f, indent=4)
        print(f"Dataset initialized in {filename}")
    else:
        with open(filename, 'r') as f:
            existing_data = json.load(f)
        existing_data.extend(dataset)
        with open(filename, 'w') as f:
            json.dump(existing_data, f, indent=4)
        print(f"Dataset incrementally updated in {filename}")

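# Illustrative usage of the append semantics: a first call with overwrite=True
# writes a fresh JSON list; later calls with overwrite=False read the file
# back, extend the list, and rewrite it.
# save_dataset([], 'prompts.json', overwrite=True)
# save_dataset([{'subcategory': 'file_access', 'prompt': '...'}], 'prompts.json', overwrite=False)
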
# Combine datasets from custom prompts and HumanEval prompts
def combine_datasets(custom_filename: str, humaneval_filename: str, output_filename: str) -> None:
    """Combine custom prompts and HumanEval prompts into one dataset with a unified structure."""
    combined_dataset = []

    # Load custom prompts
    if os.path.exists(custom_filename):
        with open(custom_filename, 'r') as f:
            custom_data = json.load(f)
        for item in custom_data:
            combined_dataset.append({
                'source': 'risky_custom',
                'subcategory': item.get('subcategory', 'unknown'),
                'prompt': item.get('prompt', '')
            })
    else:
        print(f"Custom prompts file not found: {custom_filename}")

    # Load HumanEval prompts
    if os.path.exists(humaneval_filename):
        with open(humaneval_filename, 'r') as f:
            humaneval_data = json.load(f)
        for item in humaneval_data:
            combined_dataset.append({
                'source': 'human_eval',
                'subcategory': 'human_eval',
                'prompt': item.get('prompt', '')
            })
    else:
        print(f"HumanEval prompts file not found: {humaneval_filename}")

    # Save combined dataset
    save_dataset(combined_dataset, output_filename, overwrite=True)
    print(f"Combined dataset saved to {output_filename}")

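# Every combined entry shares the shape {'source': ..., 'subcategory': ...,
# 'prompt': ...}, so downstream steps can treat both prompt sources uniformly.
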
# Evaluate each prompt-code pair
def add_rewards(generated_code_file: str = 'generated_codes.json'):
    """Evaluate generated code quality for each prompt-code pair."""

    # Read the existing JSON file
    with open(generated_code_file, 'r') as f:
        generated_codes = json.load(f)

    # Evaluate each entry and update it with the results
    for entry in generated_codes:
        prompt = entry.get('prompt', '')
        code = entry.get('code', '')
        results = evaluate_code_quality(prompt, code)
        entry.update(results)

    # Write the updated data back to the same file
    with open(generated_code_file, 'w') as f:
        json.dump(generated_codes, f, indent=4)

    print(f"Evaluation complete. Results saved to {generated_code_file}")

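# The code-generation step referenced in main() is not implemented in this
# file: add_rewards() expects a generated_codes.json whose entries carry
# 'prompt' and 'code' keys. The helper below is a minimal sketch of that
# missing step (a hypothetical addition, not part of the original pipeline),
# reusing the same client, model, and save_dataset() conventions as the rest
# of the module.
def generate_codes(prompt_file: str = 'code_prompts.json',
                   output_file: str = 'generated_codes.json') -> None:
    """Generate code for each prompt in prompt_file and save prompt-code pairs."""
    with open(prompt_file, 'r') as f:
        prompts = json.load(f)

    generated = []
    for item in prompts:
        prompt = item.get('prompt', '')
        try:
            response = openai_client.chat.completions.create(
                model="gpt-4",
                messages=[
                    {"role": "system", "content": "You are a helpful assistant that writes Python code."},
                    {"role": "user", "content": prompt}
                ],
                max_tokens=512
            )
            code = response.choices[0].message.content.strip()
        except Exception as e:
            print(f"Error generating code: {e}")
            code = ''
        # Keep the source/subcategory fields and attach the generated code
        generated.append({**item, 'code': code})

    save_dataset(generated, output_file, overwrite=True)
    print(f"Generated code saved to {output_file}")
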
# Main function
def main():
    # Generate custom prompts
    generate_custom_prompts('custom_prompts.json')

    # Load HumanEval prompts and save
    # load_humaneval_prompts()

    # Combine both datasets and use the result from here on
    combine_datasets("custom_prompts.json", "humaneval_prompts.json", "code_prompts.json")

    # Generate code for each prompt in code_prompts.json (see the generate_codes sketch above)
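    # generate_codes('code_prompts.json', 'generated_codes.json')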

    # Evaluation
    # add_rewards()


if __name__ == '__main__':
    main()