@@ -1,7 +1,9 @@
-import asyncio
-import json
+import random
+from typing import Literal, Tuple
 from langchain_openai import ChatOpenAI
-from langchain_core.prompts import ChatPromptTemplate
+from langchain_core.prompts import ChatPromptTemplate, PromptTemplate
+from langchain_core.output_parsers import PydanticOutputParser
+from pydantic import BaseModel
 from config import settings
 from agents import Agent, Runner, SQLiteSession, function_tool, set_default_openai_key
 
@@ -84,12 +86,142 @@ def run_orchestrator_agent(session_id, job_id):
         user_input = input("User: ")
     return
 
+question_bank = {
+    "python": {
+        "easy": [
+            "If `d` is a dictionary, then what does `d['name'] = 'Siddharta'` do?",
+            "If `l1` is a list and `l2` is a list, then what is `l1 + l2`?",
+        ],
+        "medium": [
+            "How do you remove a key from a dictionary?",
+            "How do you reverse a list in Python?",
+        ],
+        "hard": [
+            "If `d` is a dictionary, then what does `d.get('name', 'unknown')` do?",
+            "What is the name of the `@` operator (example: `a @ b`) in Python?",
+        ],
+    },
+    "sql": {
+        "easy": [
+            "What does LIMIT 1 do at the end of a SQL statement?",
+            "Explain this SQL: SELECT product_name FROM products WHERE cost < 500",
+        ],
+        "medium": [
+            "What is a view in SQL?",
+            "How do we find the number of records in a table called `products`?",
+        ],
+        "hard": [
+            "What is the difference between WHERE and HAVING in SQL?",
+            "Name a window function in SQL.",
+        ],
+    },
+    "system design": {
+        "easy": [
+            "Give one reason where you would prefer a SQL database over a vector database.",
+            "RAG requires a vector database. True or False?",
+        ],
+        "medium": [
+            "Give one advantage and one disadvantage of chaining multiple prompts.",
+            "Mention three reasons why we may not want to use the most powerful model.",
+        ],
+        "hard": [
+            "Mention ways to speed up retrieval from a vector database.",
+            "Give an overview of cost-accuracy-latency tradeoffs in an AI system.",
+        ],
+    },
+}
+
+@function_tool
+def get_question(topic: str, difficulty: Literal['easy', 'medium', 'hard']) -> str:
+    """Return a question from the question bank, given a topic and the difficulty of the question."""
+    questions = question_bank[topic.lower()][difficulty.lower()]
+    return random.choice(questions)
+
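+# Note: topic and difficulty are lower-cased before the lookup, so the agent
+# can pass "Python" or "MEDIUM" and still hit the bank keys; an unknown topic
+# or difficulty raises a KeyError, since the bank only covers python, sql and
+# system design.
+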
+VALIDATION_PROMPT = """
+Evaluate the given interview answer.
+
+# Instructions
+
+Provide a JSON response with:
+- correct: true or false, depending on whether the answer is correct for the given question in the context of the given skill.
+- reasoning: brief explanation (2-3 sentences)
+
+For subjective answers, mark the answer true if the majority of the important points have been mentioned.
+
+Answers are expected to be brief, so be rigorous but fair. Look for technical accuracy and clarity.
+
+# Output Format
+
+{format_instructions}
+
+# Task
+
+Skill: {skill}
+Question: {question}
+Answer:
+{answer}
+
+Evaluation:"""
+
+class ValidationResult(BaseModel):
+    correct: bool
+    reasoning: str
+
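+# ValidationResult mirrors the JSON contract described in VALIDATION_PROMPT
+# (correct + reasoning); PydanticOutputParser uses it both to generate the
+# format instructions and to parse the model's reply.
+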
+@function_tool
+def check_answer(skill: str, question: str, answer: str) -> Tuple[bool, str]:
+    """Given a question and an answer for a particular skill, validate whether the answer is correct. Returns a tuple (correct, reasoning)."""
+    llm = ChatOpenAI(model="gpt-5.1", temperature=0, api_key=settings.OPENAI_API_KEY)
+    parser = PydanticOutputParser(pydantic_object=ValidationResult)
+    prompt = PromptTemplate.from_template(VALIDATION_PROMPT).partial(format_instructions=parser.get_format_instructions())
+    chain = prompt | llm | parser
+    result = chain.invoke({"skill": skill, "question": question, "answer": answer})
+    return result.correct, result.reasoning
+
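+# The prompt | llm | parser chain is LangChain's LCEL composition: invoke()
+# runs the three stages in order, and the tool returns (correct, reasoning)
+# so the evaluator agent gets structured feedback it can act on.
+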
+EVALUATION_SYSTEM_PROMPT = """
+You are a specialised skill evaluator. Your job is to evaluate the candidate's proficiency in a given skill.
+
+1. Identify which skill you're evaluating (it will be mentioned in the conversation)
+2. Use the get_question tool to get a question to ask (start with 'medium' difficulty). Ask the question verbatim, DO NOT MODIFY it in any way
+3. After each candidate answer, use the check_answer tool to evaluate it
+4. Decide the next question:
+    - If the check_answer tool returned correct, choose the next higher difficulty, without going above 'hard'
+    - If the check_answer tool returned incorrect, choose the next lower difficulty, without going below 'easy'
+    - Stop after 3 questions MAXIMUM
+5. If the candidate correctly answered two of the three questions, then they pass, otherwise they fail
+
+DECISION RULES:
+- Maximum 3 questions per skill
+
+OUTPUT:
+
+After the evaluation is complete, return the pass/fail result in a JSON object with the following property:
+- result: true or false
+"""
+
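+# A consequence of the rules above (assuming the model follows them): a
+# 3-question run starting at medium can only take four difficulty paths:
+# medium->hard->hard, medium->hard->medium, medium->easy->medium, or
+# medium->easy->easy.
+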
+EVALUATION_USER_PROMPT = """
+Evaluate the user on the following skill: {skill}
+"""
+
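+# run_evaluation_agent drives a console-based interview: every turn is
+# replayed against the same SQLiteSession, so the agent keeps the full
+# question/answer history across turns, and typing 'bye' ends the loop.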
+def run_evaluation_agent(session_id, skill):
+    session = SQLiteSession(f"screening-{session_id}")
+    agent = Agent(
+        name="Skills Evaluator Agent",
+        instructions=EVALUATION_SYSTEM_PROMPT,
+        model="gpt-5.1",
+        tools=[get_question, check_answer]
+    )
+    user_input = EVALUATION_USER_PROMPT.format(skill=skill)
+    while user_input != 'bye':
+        result = Runner.run_sync(agent, user_input, session=session)
+        print(result.final_output)
+        user_input = input("User: ")
+
 def main():
     set_default_openai_key(settings.OPENAI_API_KEY)
     job_id = 1
     session_id = "session123"
-    run_orchestrator_agent(session_id, job_id)
-    print(f"FINAL EVALUATION STATUS: {db['state'][session_id]}")
+    run_evaluation_agent(session_id, "Python")
 
 if __name__ == "__main__":
     main()