Skip to content

Commit 8eb482e

Browse files
Jigsaw authored and copybara-github committed
Internal change
GitOrigin-RevId: 41b5533e96aa9287a0e54c5e3f0dc6921cd2af6b
1 parent 8e4812c commit 8eb482e

File tree

5 files changed

+542
-0
lines changed

5 files changed

+542
-0
lines changed
Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
// Copyright 2025 Google LLC
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
import {
16+
AutoratingAggregatedResults,
17+
generateEvaluationReport,
18+
collectStatementsAndCommentsFromCSV,
19+
} from "./autorating_utils";
20+
import fs from "fs";
21+
22+
describe("Autorating Utils", () => {
  describe("collectStatementsAndCommentsFromCSV", () => {
    it("should read statements and comments from a CSV file", () => {
      // Dummy CSV fixture written to disk for the duration of this test.
      const csvFilePath = "test_autorating.csv";
      const csvContent =
        '"summary","comments","has_hallucination"\n"statement 1","comment 1",1\n"statement 2","comment 2",0';
      fs.writeFileSync(csvFilePath, csvContent);

      try {
        const result = collectStatementsAndCommentsFromCSV(csvFilePath);

        // Only the "summary" and "comments" columns are surfaced; the extra
        // "has_hallucination" column is not part of the result.
        expect(result).toEqual([
          { statement: "statement 1", comments: "comment 1" },
          { statement: "statement 2", comments: "comment 2" },
        ]);
      } finally {
        // Remove the fixture even when the call or the assertion throws, so a
        // failing run does not leave the file behind.
        fs.unlinkSync(csvFilePath);
      }
    });
  });

  describe("generateEvaluationReport", () => {
    it("should generate a report with correct percentages and formatting", () => {
      const results: AutoratingAggregatedResults = {
        totalStatements: 10,
        questions: {
          "Question 1": { pass: 7, fail: 2, unsure: 1 },
          "Question 2": { pass: 5, fail: 5, unsure: 0 },
        },
      };
      const totalRuntimeMinutes = 5.25;

      const report = generateEvaluationReport(results, totalRuntimeMinutes);

      expect(report).toContain("Summary Evaluation Report");
      expect(report).toContain("Total statements: 10");

      expect(report).toContain("Question 1");
      expect(report).toContain("Pass: 70% (7/10)");
      expect(report).toContain("Fail: 20% (2/10)");
      expect(report).toContain("Unsure: 10% (1/10)");

      expect(report).toContain("Question 2");
      expect(report).toContain("Pass: 50% (5/10)");
      expect(report).toContain("Fail: 50% (5/10)");
      expect(report).toContain("Unsure: 0% (0/10)");

      expect(report).toContain("Total runtime: 5.25 minutes");
    });
  });
});
Lines changed: 115 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,115 @@
1+
// Copyright 2025 Google LLC
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
// Utility functions and types for automated evaluation of summarization results using LLMs.
16+
17+
import fs from "fs";
18+
import { parse } from "csv-parse/sync";
19+
20+
/**
 * Represents a statement and its corresponding comments for evaluation.
 */
export interface StatementWithComments {
  /**
   * The summary statement to be evaluated.
   */
  statement: string;
  /**
   * The comments associated with the statement.
   */
  comments: string;
}
33+
34+
/**
 * Represents aggregated results for autorating evaluations.
 */
export interface AutoratingAggregatedResults {
  /**
   * Total number of statements evaluated.
   */
  totalStatements: number;
  /**
   * Evaluation results broken down by question. Each question maps to pass/fail/unsure counts.
   */
  questions: {
    [question: string]: {
      pass: number;
      fail: number;
      unsure: number;
    };
  };
}
53+
54+
/**
 * Reads statements and comments from a CSV file and returns them as an array of
 * StatementWithComments objects.
 *
 * The file is parsed with its first row as the header (`columns: true`) and with
 * empty lines skipped. Each record's `summary` column becomes `statement` and its
 * `comments` column becomes `comments`; any other columns are ignored.
 *
 * This function does not throw on I/O or parse failures: the error is logged via
 * `console.error` and whatever has been collected so far (an empty array when
 * reading or parsing fails) is returned.
 *
 * @param csvFilePath The path to the CSV file.
 * @returns An array of StatementWithComments objects; empty if the file could not
 *     be read or parsed.
 */
export function collectStatementsAndCommentsFromCSV(csvFilePath: string): StatementWithComments[] {
  const statementsAndComments: StatementWithComments[] = [];
  try {
    const csvFileContent = fs.readFileSync(csvFilePath, "utf8");

    const csvRecords = parse(csvFileContent, {
      columns: true,
      skip_empty_lines: true,
    });

    for (const record of csvRecords) {
      statementsAndComments.push({
        statement: record.summary,
        comments: record.comments,
      });
    }
  } catch (error) {
    // Best-effort: report the failure and fall through to return what we have.
    console.error("Failed to read the input file:", error);
  }

  return statementsAndComments;
}
85+
86+
/**
 * Formats one outcome line of the evaluation report, e.g. "Pass: 70% (7/10)\n".
 */
function formatOutcomeLine(label: string, count: number, total: number): string {
  // A question with no recorded answers would otherwise divide 0 by 0 and
  // render as "NaN%"; report 0% in that case.
  const percent = total > 0 ? (count / total) * 100 : 0;
  return `${label}: ${percent.toFixed(0)}% (${count}/${total})\n`;
}

/**
 * Generates a summary evaluation report based on aggregated autorating results.
 *
 * For every question the report lists the pass, fail and unsure counts, each as a
 * whole-number percentage of that question's total answers (pass + fail + unsure)
 * followed by the raw "count/total". A question with zero answers is reported as
 * 0% for each outcome.
 *
 * @param results Aggregated results from the autorating process.
 * @param totalRuntimeMin Total runtime of the evaluation in minutes; printed with
 *     two decimal places.
 * @returns A formatted report string.
 */
export function generateEvaluationReport(
  results: AutoratingAggregatedResults,
  totalRuntimeMin: number
): string {
  let report = "Summary Evaluation Report\n\n";
  report += `Total statements: ${results.totalStatements}\n\n`;
  for (const [question, counts] of Object.entries(results.questions)) {
    const totalAnswers = counts.pass + counts.fail + counts.unsure;
    report += `${question}\n`;
    report += formatOutcomeLine("Pass", counts.pass, totalAnswers);
    report += formatOutcomeLine("Fail", counts.fail, totalAnswers);
    report += formatOutcomeLine("Unsure", counts.unsure, totalAnswers);
    report += "\n"; // Blank line between questions for readability.
  }
  report += `Total runtime: ${totalRuntimeMin.toFixed(2)} minutes\n`;
  return report;
}
Lines changed: 110 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,110 @@
1+
// Copyright 2025 Google LLC
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
import { rateHallucination } from "./hallucination_autorater";
16+
import { VertexModel } from "../../src/models/vertex_model";
17+
import { StatementWithComments } from "./autorating_utils";
18+
import fs from "fs";
19+
import * as path from "path";
20+
21+
// Replace VertexModel with Jest's automatic mock so the tests never call a real model.
jest.mock("../../src/models/vertex_model");

describe("rateHallucination", () => {
  let mockModel: jest.Mocked<VertexModel>;
  const mockOutputDir = "test_output";

  beforeEach(() => {
    mockModel = new VertexModel("", "", "") as jest.Mocked<VertexModel>; // Create a mocked instance
    // Start every test from an empty output directory. `force: true` makes the
    // removal a no-op when the directory does not exist yet.
    fs.rmSync(mockOutputDir, { recursive: true, force: true });
    fs.mkdirSync(mockOutputDir);
  });

  afterEach(() => {
    fs.rmSync(mockOutputDir, { recursive: true, force: true }); // Clean up after each test
  });

  it("should correctly process summaries and generate report", async () => {
    const summaries: StatementWithComments[] = [
      { statement: "Statement 1", comments: "Comment 1" },
      { statement: "Statement 2", comments: "Comment 2" },
    ];
    const mockResponseData = {
      analysis: "Test analysis",
      answer: "YES",
      explanation: "Test explanation",
    };
    mockModel.generateData.mockResolvedValue(mockResponseData); // Mock generateData to resolve with mock data

    await rateHallucination(mockModel, summaries, mockOutputDir);

    // Check if the files were created
    const csvPath = path.join(mockOutputDir, "hallucination_autoratings.csv");
    const reportPath = path.join(mockOutputDir, "hallucination_report.txt");
    expect(fs.existsSync(csvPath)).toBe(true);
    expect(fs.existsSync(reportPath)).toBe(true);

    // Check some of the CSV content and aggregated results
    const csvContent = fs.readFileSync(csvPath, "utf8");
    expect(csvContent).toContain("Statement 1");
    expect(csvContent).toContain("YES"); // Hallucination result for Statement 1

    // Check report content
    const reportContent = fs.readFileSync(reportPath, "utf8");
    expect(reportContent).toContain("Summary Evaluation Report");
    expect(reportContent).toContain("Total statements: 2");
  });

  it("should handle LLM errors gracefully", async () => {
    const summaries: StatementWithComments[] = [
      { statement: "Statement 1", comments: "Comment 1" },
    ];
    mockModel.generateData.mockRejectedValue(new Error("LLM Error")); // Mock an LLM error
    const consoleErrorSpy = jest.spyOn(console, "error");

    try {
      await rateHallucination(mockModel, summaries, mockOutputDir);

      expect(consoleErrorSpy).toHaveBeenCalledWith(
        "Error during LLM call or parsing:",
        expect.any(Error)
      );

      // Check for NULL values in CSV due to the error
      const csvPath = path.join(mockOutputDir, "hallucination_autoratings.csv");
      expect(fs.existsSync(csvPath)).toBe(true);
      const csvContent = fs.readFileSync(csvPath, "utf8");
      expect(csvContent).toContain("NULL");
    } finally {
      // Restore console.error even when an assertion fails, so the spy does
      // not leak into later tests.
      consoleErrorSpy.mockRestore();
    }
  });

  it("should handle invalid responses from LLM", async () => {
    const summaries: StatementWithComments[] = [
      { statement: "Statement 1", comments: "Comment 1" },
    ];
    mockModel.generateData.mockResolvedValue(null); // Mock invalid response
    const consoleWarnSpy = jest.spyOn(console, "warn");

    try {
      await rateHallucination(mockModel, summaries, mockOutputDir);

      expect(consoleWarnSpy).toHaveBeenCalledWith(
        "Skipping statement due to LLM error or invalid response."
      );
    } finally {
      // Restore console.warn even when the assertion fails.
      consoleWarnSpy.mockRestore();
    }
  });
});

0 commit comments

Comments
 (0)