Skip to content

Commit fffe986

Browse files
committed
update eval
1 parent d924370 commit fffe986

File tree

1 file changed

+28
-25
lines changed

1 file changed

+28
-25
lines changed

evals/tasks/agent/kith.ts

Lines changed: 28 additions & 25 deletions
Original file line number | Diff line number | Diff line change
@@ -31,41 +31,44 @@ export const kith: EvalFunction = async ({
3131

3232
const success = evaluation === "YES";
3333

34-
await agent.execute({
35-
instruction: "fill the payment information",
36-
maxSteps: 30,
37-
});
38-
39-
const { evaluation: evaluation2, reasoning: reasoning2 } =
40-
await evaluator.evaluate({
41-
question: "Did the agent fill the payment information",
34+
if (success) {
35+
await agent.execute({
36+
instruction: "fill the payment information",
37+
maxSteps: 30,
4238
});
4339

44-
const success2 = evaluation2 === "YES";
40+
const { evaluation: evaluation2, reasoning: reasoning2 } =
41+
await evaluator.evaluate({
42+
question: "Did the agent fill the payment information",
43+
});
4544

46-
if (!success) {
47-
return {
48-
_success: false,
49-
message: `${reasoning} ${reasoning2}`,
50-
debugUrl,
51-
sessionUrl,
52-
logs: logger.getLogs(),
53-
};
54-
} else if (!success2) {
45+
const success2 = evaluation2 === "YES";
46+
47+
if (success2) {
48+
return {
49+
_success: true,
50+
debugUrl,
51+
sessionUrl,
52+
logs: logger.getLogs(),
53+
};
54+
} else {
55+
return {
56+
_success: false,
57+
message: reasoning2,
58+
debugUrl,
59+
sessionUrl,
60+
logs: logger.getLogs(),
61+
};
62+
}
63+
} else {
5564
return {
5665
_success: false,
57-
message: reasoning2,
66+
message: reasoning,
5867
debugUrl,
5968
sessionUrl,
6069
logs: logger.getLogs(),
6170
};
6271
}
63-
return {
64-
_success: true,
65-
debugUrl,
66-
sessionUrl,
67-
logs: logger.getLogs(),
68-
};
6972
} catch (error) {
7073
return {
7174
_success: false,

0 commit comments

Comments
 (0)