import importlib.metadata
import os
import time

import mlflow
import numpy as np

from llm_evaluation.dataset import QADataset
from llm_evaluation.llm_evaluator import compare_sql_with_llm
from llm_evaluation.mlflow_logger import log_to_mlflow


class Evaluator:
    def __init__(self, dataset_path):
        self.dataset = QADataset(dataset_path)

    def evaluate(self, generated_sql_fn):
        """Evaluate Lang2SQL output (the caller supplies the SQL generation function).

        Each evaluation sample is recorded as a nested MLflow run.
        """
        results = []
        metrics_by_type = {}  # scores grouped by evaluation_type

        # MLflow setup: tracking URI and experiment name
        mlflow.set_tracking_uri(os.getenv("MLFLOW_TRACKING_URI"))
        try:
            lang2sql_version = importlib.metadata.version("lang2sql")
        except importlib.metadata.PackageNotFoundError:
            lang2sql_version = "unknown"
        experiment_name = f"lang2sql-evaluation-v{lang2sql_version}"
        mlflow.set_experiment(experiment_name)

        # Wrap the whole evaluation in a single parent run
        with mlflow.start_run(run_name="evaluation_run") as parent_run:
            for (
                question,
                ground_truth_sql,
                evaluation_type,
            ) in self.dataset.get_samples():
                start_time = time.time()
                generated_sql = generated_sql_fn(question)
                exec_time = time.time() - start_time

                # LLM evaluation result (currently a single score)
                llm_score = compare_sql_with_llm(
                    generated_sql, ground_truth_sql, question
                )
                # Collect scores per evaluation_type
                metrics_by_type.setdefault(evaluation_type, []).append(llm_score)

                feedback_data = {
                    "question": question,
                    "generated_sql": generated_sql,
                    "ground_truth_sql": ground_truth_sql,
                    "llm_evaluation_metric": llm_score,  # per-query metric
                    "execution_time": exec_time,
                    "evaluation_type": evaluation_type,
                }

                # Record each sample as a nested run, named after its evaluation_type
                with mlflow.start_run(nested=True, run_name=str(evaluation_type)):
                    log_to_mlflow(
                        question,
                        generated_sql,
                        ground_truth_sql,
                        llm_score,
                        evaluation_type,
                    )
                    mlflow.log_metric("execution_time", exec_time)

                self.dataset.save_feedback(feedback_data)
                results.append(feedback_data)

            # Aggregate per evaluation_type (mean, max, min, median)
            aggregated_metrics = {}
            for eval_type, scores in metrics_by_type.items():
                # Log each individual score as a step of a per-type metric
                for idx, score in enumerate(scores):
                    mlflow.log_metric(f"{eval_type}", score, step=idx)

                aggregated_metrics[eval_type] = {
                    "mean": float(np.mean(scores)),
                    "max": float(np.max(scores)),
                    "min": float(np.min(scores)),
                    "median": float(np.median(scores)),
                }

            # Record the aggregated metrics as a tag (stringified)
            mlflow.set_tag("aggregated_metrics", str(aggregated_metrics))

        return results
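
A minimal usage sketch, assuming the class is importable as llm_evaluation.evaluator, a QA dataset file at a hypothetical path, and a placeholder generated_sql_fn; a real caller would plug in the actual Lang2SQL generation pipeline and point MLFLOW_TRACKING_URI at a reachable MLflow server.

# Hypothetical wiring: the import path, dataset path, and stub_lang2sql
# are illustrative placeholders, not part of this PR.
from llm_evaluation.evaluator import Evaluator


def stub_lang2sql(question: str) -> str:
    # Placeholder; a real implementation would call the Lang2SQL pipeline.
    return f"SELECT 1  -- TODO: answer {question!r}"


if __name__ == "__main__":
    evaluator = Evaluator("data/qa_dataset.json")
    results = evaluator.evaluate(stub_lang2sql)
    print(f"Evaluated {len(results)} samples")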