|
12 | 12 | Evaluate LLM outputs using multiple metrics and compute associated costs
13 | 13 | """
14 | 14 |
|
15 | | -# This script evaluates LLM outputs using the `lighteval` library |
16 | | -# https://huggingface.co/docs/lighteval/en/metric-list#automatic-metrics-for-generative-tasks |
17 | | - |
18 | | -# This script uses Python 3.11 where prebuilt wheels for `sentencepiece` exist |
19 | | - |
20 | | - |
21 | | -# TODO: Add tests on a small dummy dataset to confirm it handles errors gracefully and produces expected outputs |
22 | | - |
23 | 15 | import sys |
24 | 16 | import os |
25 | 17 |
|
|
30 | 22 | import logging |
31 | 23 |
|
32 | 24 | import pandas as pd |
| 25 | + |
| 26 | +# lighteval depends on `sentencepiece`, which only has prebuilt wheels for Python 3.11 or below
33 | 27 | from lighteval.tasks.requests import Doc |
34 | 28 | from lighteval.metrics.metrics_sample import Extractiveness |
35 | 29 |
|
|
40 | 34 | ) |
41 | 35 |
|
42 | 36 |
|
43 | | -def evaluate_response(model_name: str, query: str, context: str) -> pd.DataFrame: |
| 37 | +def evaluate_response(model: str, instructions: str, input: str) -> pd.DataFrame: |
| 38 | + """ |
| 39 | + Evaluate a model's response to the given instructions and input, returning a one-row DataFrame with extractiveness metrics, token usage, cost, and duration
44 | 40 | """ |
45 | | - Evaluates the response of a model to a given query and context, computes extractiveness metrics, token usage, and cost |
46 | 41 |
|
47 | | - Args: |
48 | | - model_name (str): The name of the model to be used for evaluation. |
49 | | - query (str): The user query to be processed. |
50 | | - context (str): The context or document content to be used. |
51 | | - reference (str): The reference text for comparison (not used in this function, but can be used for further evaluations). |
| 42 | + try: |
| 43 | + handler = ModelFactory.get_handler(model) |
52 | 44 |
|
53 | | - Returns: |
54 | | - pd.DataFrame: A DataFrame containing the output text, extractiveness metrics, token usage, cost, and duration. |
55 | | - """ |
| 45 | + generated_text, token_usage, pricing, duration = handler.handle_request( |
| 46 | + instructions, input |
| 47 | + ) |
56 | 48 |
|
57 | | - handler = ModelFactory.get_handler(model_name) |
| 49 | + doc = Doc(query="", choices=[], gold_index=0, specific={"text": input}) |
| 50 | + extractiveness = Extractiveness().compute( |
| 51 | + formatted_doc=doc, predictions=[generated_text] |
| 52 | + ) |
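| | + # Extractiveness reads the source text from Doc.specific["text"]; query, choices and
| | + # gold_index are placeholders here. The returned dict contains summarization_coverage,
| | + # summarization_density and summarization_compression, the extractive-fragment statistics
| | + # of Grusky et al. (2018): roughly, the share of generated words copied from the source,
| | + # the average length of those copied fragments, and the source-to-output length ratio.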
58 | 53 |
|
59 | | - # TODO: Add error handling for unsupported models |
| 54 | + cost_metrics = calculate_cost_metrics(token_usage, pricing) |
60 | 55 |
|
61 | | - output_text, token_usage, pricing, duration = handler.handle_request(query, context) |
| 56 | + result = pd.DataFrame( |
| 57 | + [ |
| 58 | + { |
| 59 | + "Generated Text": generated_text, |
| 60 | + "Extractiveness Coverage": extractiveness["summarization_coverage"], |
| 61 | + "Extractiveness Density": extractiveness["summarization_density"], |
| 62 | + "Extractiveness Compression": extractiveness[ |
| 63 | + "summarization_compression" |
| 64 | + ], |
| 65 | + "Input Token Usage": token_usage.input_tokens, |
| 66 | + "Output Token Usage": token_usage.output_tokens, |
| 67 | + "Cost (USD)": cost_metrics["total_cost"], |
| 68 | + "Duration (s)": duration, |
| 69 | + } |
| 70 | + ] |
| 71 | + ) |
62 | 72 |
|
63 | | - doc = Doc(query="", choices=[], gold_index=0, specific={"text": context}) |
64 | | - extractiveness = Extractiveness().compute( |
65 | | - formatted_doc=doc, predictions=[output_text] |
66 | | - ) |
| 73 | + except Exception as e: |
| 74 | + logging.error(f"Error evaluating response for model {model}: {e}") |
| 75 | + result = pd.DataFrame( |
| 76 | + [ |
| 77 | + { |
| 78 | + "Generated Text": None, |
| 79 | + "Extractiveness Coverage": None, |
| 80 | + "Extractiveness Density": None, |
| 81 | + "Extractiveness Compression": None, |
| 82 | + "Input Token Usage": None, |
| 83 | + "Output Token Usage": None, |
| 84 | + "Cost (USD)": None, |
| 85 | + "Duration (s)": None, |
| 86 | + } |
| 87 | + ] |
| 88 | + ) |
| 89 | + |
| 90 | + return result |
| 91 | + |
| 92 | + |
| 93 | +def calculate_cost_metrics(token_usage, pricing: dict) -> dict:
| 94 | + """
| 95 | + Calculate input, output and total cost in USD from a token usage object (with input_tokens/output_tokens attributes) and per-million-token pricing
| 96 | + """ |
67 | 97 |
|
68 | | - input_cost_dollars = (pricing["input"] / 1000000) * token_usage.input_tokens |
69 | | - output_cost_dollars = (pricing["output"] / 1000000) * token_usage.output_tokens |
| 98 | + TOKENS_PER_MILLION = 1_000_000 |
70 | 99 |
|
| 100 | + # Pricing is in dollars per million tokens |
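| | + # Example with hypothetical prices: 1,500 input tokens at $3.00/M plus 400 output tokens
| | + # at $15.00/M cost (3.00 / 1e6) * 1500 + (15.00 / 1e6) * 400 = 0.0045 + 0.0060 = $0.0105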
| 101 | + input_cost_dollars = ( |
| 102 | + pricing["input"] / TOKENS_PER_MILLION |
| 103 | + ) * token_usage.input_tokens |
| 104 | + output_cost_dollars = ( |
| 105 | + pricing["output"] / TOKENS_PER_MILLION |
| 106 | + ) * token_usage.output_tokens |
71 | 107 | total_cost_dollars = input_cost_dollars + output_cost_dollars |
72 | 108 |
|
73 | | - return pd.DataFrame( |
74 | | - [ |
75 | | - { |
76 | | - "Output Text": output_text, |
77 | | - "Extractiveness Coverage": extractiveness["summarization_coverage"], |
78 | | - "Extractiveness Density": extractiveness["summarization_density"], |
79 | | - "Extractiveness Compression": extractiveness[ |
80 | | - "summarization_compression" |
81 | | - ], |
82 | | - "Input Token Usage": token_usage.input_tokens, |
83 | | - "Output Token Usage": token_usage.output_tokens, |
84 | | - "Cost (USD)": total_cost_dollars, |
85 | | - "Duration (s)": duration, |
86 | | - } |
87 | | - ] |
88 | | - ) |
| 109 | + return { |
| 110 | + "input_cost": input_cost_dollars, |
| 111 | + "output_cost": output_cost_dollars, |
| 112 | + "total_cost": total_cost_dollars, |
| 113 | + } |
89 | 114 |
|
90 | 115 |
|
91 | | -if __name__ == "__main__": |
92 | | - # TODO: Add test evaluation argument to run on the first 10 rows of the config file |
| 116 | +def load_csv(file_path: str, required_columns: list) -> pd.DataFrame: |
| 117 | + """ |
| 118 | + Load a CSV file and validate that it contains the required columns |
93 | 119 |
|
94 | | - # TODO: Add CLI argument to specify the metrics to be computed |
95 | | - parser = argparse.ArgumentParser( |
96 | | - description="Evaluate LLM outputs using multiple metrics and compute associated costs" |
97 | | - ) |
98 | | - parser.add_argument("--config", "-c", required=True, help="Path to config CSV file") |
99 | | - parser.add_argument( |
100 | | - "--reference", "-r", required=True, help="Path to reference CSV file" |
101 | | - ) |
102 | | - parser.add_argument("--output", "-o", required=True, help="Path to output CSV file") |
| 120 | + Args: |
| 121 | + file_path (str): Path to the CSV file |
| 122 | + required_columns (list): List of required column names |
103 | 123 |
|
104 | | - args = parser.parse_args() |
| 124 | + Returns: |
| 125 | + pd.DataFrame: The loaded data with stripped, upper-cased column names
| 126 | + """ |
| 127 | + |
| 128 | + df = pd.read_csv(file_path) |
105 | 129 |
|
106 | | - df_config = pd.read_csv(args.config) |
107 | | - logging.info(f"Config DataFrame shape: {df_config.shape}") |
108 | | - logging.info(f"Config DataFrame columns: {df_config.columns.tolist()}") |
| 130 | + # Strip leading/trailing whitespace from column names
| 131 | + df.columns = df.columns.str.strip() |
109 | 132 |
|
110 | | - # Remove the trailing whitespace from column names |
111 | | - df_config.columns = df_config.columns.str.strip() |
| 133 | + # Uppercase the column names to match the expected format |
| 134 | + df.columns = df.columns.str.upper() |
112 | 135 |
|
113 | 136 | # Check if the required columns are present |
114 | | - # TODO: Make this more flexible by allowing the user to use default instructions |
115 | | - required_columns = ["Model Name", "Query"] |
116 | | - if not all(col in df_config.columns for col in required_columns): |
| 137 | + if not all(col in df.columns for col in required_columns): |
117 | 138 | raise ValueError( |
118 | | - f"Config DataFrame must contain the following columns: {required_columns}" |
| 139 | + f"{file_path} must contain the following columns: {required_columns}" |
119 | 140 | ) |
120 | 141 |
|
121 | | - # Check if all models in the config are supported by ModelFactory |
| 142 | + return df |
| 143 | + |
| 144 | + |
| 145 | +if __name__ == "__main__": |
| 146 | + # TODO: Add test evaluation argument to run on the first 10 rows of the dataset file |
| 147 | + |
| 148 | + parser = argparse.ArgumentParser(description="Evaluate LLM outputs using multiple metrics and compute associated costs")
| 149 | + parser.add_argument( |
| 150 | + "--experiments", "-e", required=True, help="Path to experiments CSV file" |
| 151 | + ) |
| 152 | + parser.add_argument( |
| 153 | + "--dataset", "-d", required=True, help="Path to dataset CSV file" |
| 154 | + ) |
| 155 | + parser.add_argument( |
| 156 | + "--results", "-r", required=True, help="Path to results CSV file" |
| 157 | + ) |
| 158 | + |
| 159 | + args = parser.parse_args() |
| 160 | + |
| 161 | + df_experiment = load_csv( |
| 162 | + args.experiments, required_columns=["MODEL", "INSTRUCTIONS"] |
| 163 | + ) |
| 164 | + # Check if all models are supported by ModelFactory |
122 | 165 | if not all( |
123 | 166 | model in ModelFactory.HANDLERS.keys() |
124 | | - for model in df_config["Model Name"].unique() |
| 167 | + for model in df_experiment["MODEL"].unique() |
125 | 168 | ): |
126 | 169 | raise ValueError( |
127 | | - f"Unsupported model(s) found in config: {set(df_config['Model Name'].unique()) - set(ModelFactory.HANDLERS.keys())}" |
128 | | - ) |
129 | | - |
130 | | - df_reference = pd.read_csv(args.reference) |
131 | | - logging.info(f"Reference DataFrame shape: {df_reference.shape}") |
132 | | - logging.info(f"Reference DataFrame columns: {df_reference.columns.tolist()}") |
133 | | - |
134 | | - # Remove the trailing whitespace from column names |
135 | | - df_reference.columns = df_reference.columns.str.strip() |
136 | | - # Check if the required columns are present |
137 | | - required_columns = ["Context"] |
138 | | - if not all(col in df_reference.columns for col in required_columns): |
139 | | - raise ValueError( |
140 | | - f"Reference DataFrame must contain the following columns: {required_columns}" |
| 170 | + f"Unsupported model(s) found: {set(df_experiment['MODEL'].unique()) - set(ModelFactory.HANDLERS.keys())}" |
141 | 171 | ) |
| 172 | + df_dataset = load_csv(args.dataset, required_columns=["INPUT"]) |
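| | + # Expected inputs (hypothetical example values; load_csv strips and upper-cases column names):
| | + #   experiments CSV columns: MODEL, INSTRUCTIONS  e.g. "some-model", "Summarize the input in two sentences."
| | + #   dataset CSV columns:     INPUT                e.g. the full document text to summarize
| | + # Every MODEL value must be a key in ModelFactory.HANDLERS (validated above).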
142 | 173 |
|
143 | | - # Cross join the config and reference DataFrames |
144 | | - df_in = df_config.merge(df_reference, how="cross") |
| 174 | + # Bulk model and prompt experimentation: cross join so every model/instructions pair is evaluated on every dataset input
| 175 | + df_in = df_experiment.merge(df_dataset, how="cross") |
145 | 176 |
|
146 | | - # TODO: Parallelize the evaluation process for each row in df_in using concurrent.futures or similar libraries |
147 | | - df_evals = pd.DataFrame() |
148 | | - for index, row in df_in.iterrows(): |
149 | | - df_evals = pd.concat( |
150 | | - [ |
151 | | - df_evals, |
152 | | - evaluate_response(row["Model Name"], row["Query"], row["Context"]), |
153 | | - ], |
154 | | - axis=0, |
155 | | - ) |
| 177 | + # Evaluate each row in the input DataFrame |
| 178 | + results = [] |
| 179 | + for index, row in enumerate(df_in.itertuples(index=False)): |
| 180 | + result = evaluate_response(row.MODEL, row.INSTRUCTIONS, row.INPUT) |
| 181 | + results.append(result) |
156 | 182 |
|
| 183 | + # TODO: Use tqdm or similar library to show progress bar |
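| | + # A possible sketch (hypothetical change; tqdm is not imported in this script):
| | + #   for index, row in enumerate(tqdm(df_in.itertuples(index=False), total=len(df_in))):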
157 | 184 | logging.info(f"Processed row {index + 1}/{len(df_in)}") |
158 | 185 |
|
159 | | - # Concatenate the input and evaluations DataFrames |
| 186 | + df_evals = pd.concat(results, axis=0, ignore_index=True) |
160 | 187 |
|
| 188 | + # Concatenate the input and evaluations DataFrames |
161 | 189 | df_out = pd.concat( |
162 | 190 | [df_in.reset_index(drop=True), df_evals.reset_index(drop=True)], axis=1 |
163 | 191 | ) |
164 | | - |
165 | | - df_out.to_csv(args.output, index=False) |
166 | | - logging.info(f"Output DataFrame shape: {df_out.shape}") |
167 | | - logging.info(f"Results saved to {args.output}") |
| 192 | + df_out.to_csv(args.results, index=False) |
| 193 | + logging.info(f"Results saved to {args.results}") |
168 | 194 | logging.info("Evaluation completed successfully.") |