Skip to content

Commit 61c5aeb

Browse files
committed
Instructions for lab 15
1 parent dd57443 commit 61c5aeb

File tree

2 files changed

+377
-5
lines changed

2 files changed

+377
-5
lines changed

agent.py

Lines changed: 137 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,9 @@
1-
import asyncio
2-
import json
1+
import random
2+
from typing import Literal, Tuple
33
from langchain_openai import ChatOpenAI
4-
from langchain_core.prompts import ChatPromptTemplate
4+
from langchain_core.prompts import ChatPromptTemplate, PromptTemplate
5+
from langchain_core.output_parsers import PydanticOutputParser
6+
from pydantic import BaseModel
57
from config import settings
68
from agents import Agent, Runner, SQLiteSession, function_tool, set_default_openai_key
79

@@ -84,12 +86,142 @@ def run_orchestrator_agent(session_id, job_id):
8486
user_input = input("User: ")
8587
return
8688

89+
# Static bank of interview questions, keyed first by topic, then by
# difficulty ('easy' | 'medium' | 'hard').  Keys are lower-case; callers
# should normalise their input before indexing (see get_question).
question_bank = {
    "python": {
        "easy": [
            "If `d` is a dictionary, then what does `d['name'] = 'Siddharta'` do?",
            "if `l1` is a list and `l2` is a list, then what is `l1 + l2`?",
        ],
        "medium": [
            "How do you remove a key from a dictionary?",
            "How do you reverse a list in python?"
        ],
        "hard": [
            "If `d` is a dictionary, then what does `d.get('name', 'unknown')` do?",
            "What is the name of the `@` operator (Example `a @ b`) in Python?"
        ]
    },
    "sql": {
        "easy": [
            "What does LIMIT 1 do at the end of a SQL statement?",
            # Fixed: the original text ended with a stray apostrophe after 500.
            "Explain this SQL: SELECT product_name FROM products WHERE cost < 500"
        ],
        "medium": [
            "What is a view in SQL?",
            "How do we find the number of records in a table called `products`?"
        ],
        "hard": [
            "What is the difference between WHERE and HAVING in SQL?",
            "Name a window function in SQL"
        ]
    },
    "system design": {
        "easy": [
            "Give one reason where you would prefer a SQL database over a Vector database",
            "RAG requires a vector database. True or False?"
        ],
        "medium": [
            "Give one advantage and one disadvantage of chaining multiple prompts?",
            "Mention three reasons why we may not want to use the most powerful model?"
        ],
        "hard": [
            "Mention ways to speed up retrieval from a vector database",
            "Give an overview of Cost - Accuracy - Latency tradeoffs in an AI system"
        ]
    }
}
133+
134+
@function_tool
def get_question(topic: str, difficulty: Literal['easy', 'medium', 'hard']) -> str:
    """Pick one question at random for *topic* at the requested *difficulty*.

    Both arguments are lower-cased before indexing into the module-level
    ``question_bank``, so callers may pass e.g. ``"Python"`` or ``"EASY"``.
    """
    pool = question_bank[topic.lower()][difficulty.lower()]
    return random.choice(pool)
139+
140+
VALIDATION_PROMPT = """
141+
Evaluate the given interview answer.
142+
143+
# Instructions
144+
145+
Provide a JSON response with:
146+
- correct: true or false depending if the answer was correct or not for the given question in the context of the given skill.
147+
- reasoning: brief explanation (2-3 sentences)
148+
149+
For subjective answers, mark the answer true if the majority of the important points have been mentioned.
150+
151+
Answers are expected to be brief, so be rigorous but fair. Look for technical accuracy and clarity.
152+
153+
# Output Format
154+
155+
{format_instructions}
156+
157+
# Task
158+
159+
Skill: {skill}
160+
Question: {question}
161+
Answer:
162+
{answer}
163+
164+
Evaluation:"""
165+
166+
class ValidationResult(BaseModel):
    """Structured grading verdict parsed from the validation LLM's JSON output."""

    # True when the answer is judged correct for the question/skill.
    correct: bool
    # Brief (2-3 sentence) justification for the verdict.
    reasoning: str
169+
170+
@function_tool
def check_answer(skill: str, question: str, answer: str) -> Tuple[bool, str]:
    """Given a question and an answer for a particular skill, validate if the answer is correct. Returns a tuple (correct, reasoning)"""
    llm = ChatOpenAI(model="gpt-5.1", temperature=0, api_key=settings.OPENAI_API_KEY)
    parser = PydanticOutputParser(pydantic_object=ValidationResult)
    # Bake the parser's JSON-schema instructions into the template up front,
    # leaving only the per-call fields to be filled in by invoke().
    prompt = PromptTemplate.from_template(VALIDATION_PROMPT).partial(
        format_instructions=parser.get_format_instructions()
    )
    chain = prompt | llm | parser
    result = chain.invoke({"skill": skill, "question": question, "answer": answer})
    # BUG FIX: the original returned result.model_dump_json() (a JSON string),
    # contradicting the declared Tuple[bool, str] return type and the docstring.
    return result.correct, result.reasoning
180+
181+
# System prompt for the per-skill evaluator agent.  The adaptive-difficulty
# rules below drive which question it fetches next via the get_question tool.
# Fixed a garbled sentence in step 5 ("If the correctly answered" -> "If they
# correctly answered").
EVALUATION_SYSTEM_PROMPT = """
You are a specialised skill evaluator. Your job is to evaluate the candidate's proficiency in a given skill

1. Identify which skill you're evaluating (it will be mentioned in the conversation)
2. Use the get_question tool to get a question to ask (start with 'medium' difficulty). Ask the question verbatim, DO NOT MODIFY it in any way
3. After each candidate answer, use check_answer tool to evaluate
4. Decide the next question:
- If the check_answer tool returned correct, choose the next higher difficulty, without going above 'hard'
- If the check_answer tool returned incorrect, choose the lower difficulty, without going below 'easy'
- Stop after 3 questions MAXIMUM
5. If they correctly answered two of the three questions, then they pass, otherwise they fail

DECISION RULES:
- Maximum 3 questions per skill

OUTPUT:

After the evaluation is complete, return the pass/fail in a json object with the following properties
- result: true or false
"""

# User-turn template that kicks off an evaluation for a single skill.
EVALUATION_USER_PROMPT = """
Evaluate the user on the following skill: {skill}
"""
205+
206+
def run_evaluation_agent(session_id, skill):
    """Run an interactive skill-evaluation chat loop until the user types 'bye'.

    Conversation state is persisted in a per-session SQLite store so the
    evaluator keeps context across turns.
    """
    chat_session = SQLiteSession(f"screening-{session_id}")
    evaluator = Agent(
        name="Skills Evaluator Agent",
        instructions=EVALUATION_SYSTEM_PROMPT,
        model="gpt-5.1",
        tools=[get_question, check_answer],
    )
    # The first "user" turn is the kick-off message naming the skill to test.
    message = EVALUATION_USER_PROMPT.format(skill=skill)
    while True:
        if message == 'bye':
            break
        turn = Runner.run_sync(evaluator, message, session=chat_session)
        print(turn.final_output)
        message = input("User: ")
219+
87220
def main():
    """Entry point: configure the OpenAI key and evaluate the Python skill."""
    set_default_openai_key(settings.OPENAI_API_KEY)
    # Removed unused local `job_id`, left over from the replaced
    # run_orchestrator_agent(session_id, job_id) call.
    session_id = "session123"
    run_evaluation_agent(session_id, "Python")


if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)