"""
Evaluate LLM outputs using multiple metrics and compute associated costs.
"""

# TODO: Add tests on a small dummy dataset to confirm it handles errors
# gracefully and produces expected outputs

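# One possible shape for such a test (a sketch only; pytest, monkeypatch, and the
# stubbed handler are assumptions, not part of this module):
#
#     def test_evaluate_response_columns(monkeypatch, stub_handler):
#         monkeypatch.setattr(ModelFactory, "get_handler", lambda name: stub_handler)
#         df = evaluate_response("stub-model", "query", "some context", "reference")
#         assert {"Output Text", "Cost (USD)", "Duration (s)"} <= set(df.columns)
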
import sys
import os

# Ensure the parent directory is in the path to import ModelFactory
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))

import argparse
import logging

import pandas as pd
from lighteval.tasks.requests import Doc
from lighteval.metrics.metrics_sample import Extractiveness

from server.api.services.llm_services import ModelFactory

logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)


def evaluate_response(
    model_name: str, query: str, context: str, reference: str
) -> pd.DataFrame:
    """
    Evaluate a model's response to a given query and context, computing extractiveness metrics, token usage, cost, and duration.

    Args:
        model_name (str): The name of the model to be used for evaluation.
        query (str): The user query to be processed.
        context (str): The context or document content to be used.
        reference (str): The reference text for comparison (not used in this function, but available for further evaluations).

    Returns:
        pd.DataFrame: A single-row DataFrame containing the output text, extractiveness metrics, token usage, cost, and duration.
    """

    handler = ModelFactory.get_handler(model_name)

    # TODO: Add error handling for unsupported models
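    # One hedged sketch of that handling (the exception raised by
    # ModelFactory.get_handler for an unknown model is an assumption):
    #
    #     try:
    #         handler = ModelFactory.get_handler(model_name)
    #     except Exception as exc:
    #         raise ValueError(f"Unsupported model: {model_name}") from exc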

    output_text, token_usage, pricing, duration = handler.handle_request(
        query, context
    )

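    # Extractiveness reads the source text from formatted_doc.specific["text"] and
    # compares it against the prediction, so the remaining Doc fields are
    # placeholders (assumption based on how the metric is used here).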
    doc = Doc(query="", choices=[], gold_index=0, specific={"text": context})
    extractiveness = Extractiveness().compute(
        formatted_doc=doc, predictions=[output_text]
    )

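    # Pricing values are treated as USD per million tokens (an assumption implied
    # by the division below), so costs scale linearly with token usage.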
    input_cost_dollars = (pricing["input"] / 1_000_000) * token_usage.input_tokens
    output_cost_dollars = (pricing["output"] / 1_000_000) * token_usage.output_tokens

    total_cost_dollars = input_cost_dollars + output_cost_dollars

    return pd.DataFrame(
        [
            {
                "Output Text": output_text,
                "Extractiveness Coverage": extractiveness["summarization_coverage"],
                "Extractiveness Density": extractiveness["summarization_density"],
                "Extractiveness Compression": extractiveness[
                    "summarization_compression"
                ],
                "Input Token Usage": token_usage.input_tokens,
                "Output Token Usage": token_usage.output_tokens,
                "Cost (USD)": total_cost_dollars,
                "Duration (s)": duration,
            }
        ]
    )


if __name__ == "__main__":
    # TODO: Add CLI argument to specify the metrics to be computed
    parser = argparse.ArgumentParser(
        description="Evaluate LLM outputs using multiple metrics and compute associated costs"
    )
    parser.add_argument("--config", "-c", required=True, help="Path to config CSV file")
    parser.add_argument(
        "--reference", "-r", required=True, help="Path to reference CSV file"
    )
    parser.add_argument("--output", "-o", required=True, help="Path to output CSV file")

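    # Example invocation (script and file names are illustrative):
    #   python evaluate_llm_outputs.py -c config.csv -r reference.csv -o results.csv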
    args = parser.parse_args()

    df_config = pd.read_csv(args.config)
    logging.info(f"Config DataFrame shape: {df_config.shape}")
    logging.info(f"Config DataFrame columns: {df_config.columns.tolist()}")

    # Strip leading and trailing whitespace from column names
    df_config.columns = df_config.columns.str.strip()

    # Check if the required columns are present
    required_columns = ["Model Name", "Query"]
    if not all(col in df_config.columns for col in required_columns):
        raise ValueError(
            f"Config DataFrame must contain the following columns: {required_columns}"
        )

    # Check if all models in the config are supported by ModelFactory
    if not all(
        model in ModelFactory.HANDLERS
        for model in df_config["Model Name"].unique()
    ):
        raise ValueError(
            f"Unsupported model(s) found in config: {set(df_config['Model Name'].unique()) - set(ModelFactory.HANDLERS)}"
        )

    df_reference = pd.read_csv(args.reference)
    logging.info(f"Reference DataFrame shape: {df_reference.shape}")
    logging.info(f"Reference DataFrame columns: {df_reference.columns.tolist()}")

    # Strip leading and trailing whitespace from column names
    df_reference.columns = df_reference.columns.str.strip()

    # Check if the required columns are present
    required_columns = ["Context", "Reference"]
    if not all(col in df_reference.columns for col in required_columns):
        raise ValueError(
            f"Reference DataFrame must contain the following columns: {required_columns}"
        )

    # Cross join the config and reference DataFrames
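    # so that every (Model Name, Query) row is evaluated against every
    # (Context, Reference) row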
    df_in = df_config.merge(df_reference, how="cross")

    # TODO: Parallelize the evaluation process for each row in df_in using
    # concurrent.futures or similar libraries
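    # A hedged sketch of that parallelization (untested; assumes the handlers are
    # safe to call from multiple threads):
    #
    #     from concurrent.futures import ThreadPoolExecutor
    #     with ThreadPoolExecutor(max_workers=4) as pool:
    #         eval_frames = list(
    #             pool.map(
    #                 lambda r: evaluate_response(
    #                     r["Model Name"], r["Query"], r["Context"], r["Reference"]
    #                 ),
    #                 (row for _, row in df_in.iterrows()),
    #             )
    #         )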
    eval_frames = []
    for index, row in df_in.iterrows():
        eval_frames.append(
            evaluate_response(
                row["Model Name"], row["Query"], row["Context"], row["Reference"]
            )
        )

        logging.info(f"Processed row {index + 1}/{len(df_in)}")

    # Concatenate once at the end rather than growing the DataFrame each iteration
    df_evals = pd.concat(eval_frames) if eval_frames else pd.DataFrame()

    # Concatenate the input and evaluations DataFrames
    df_out = pd.concat(
        [df_in.reset_index(drop=True), df_evals.reset_index(drop=True)], axis=1
    )

    df_out.to_csv(args.output, index=False)
    logging.info(f"Output DataFrame shape: {df_out.shape}")
    logging.info(f"Results saved to {args.output}")
    logging.info("Evaluation completed successfully.")