34 changes: 27 additions & 7 deletions services/budeval/budeval/engines/opencompass/transformer.py
@@ -1,6 +1,6 @@
# Convert the request to required format for opencompass to work
from budeval.commons.logging import logging
from budeval.evals.schema import EvaluationRequest
from budeval.evals.schema import EvalMode, EvaluationRequest


logger = logging.getLogger(__name__)
@@ -17,6 +17,20 @@ def transform(self, request: EvaluationRequest) -> list:
jobs = []
# for each request.eval_datasets
for dataset in request.eval_datasets:
eval_mode = dataset.eval_mode or request.eval_mode
ppl_enabled = eval_mode == EvalMode.PPL or bool(
request.eval_model_info.extra_args.get("ppl", False)
)
Comment on lines +20 to +23 — Contributor (severity: high):
The current logic for determining if a PPL evaluation is requested can lead to inconsistencies. ppl_enabled can be true due to extra_args, but mode_flag and output_dir will not reflect this as they only depend on eval_mode. This can result in a PPL configuration being generated for a non-PPL run command, and results being saved in a misleadingly named directory.

To ensure consistency, the eval_mode should be the single source of truth. It should be determined first, taking into account backward compatibility with extra_args, and then all other variables (ppl_enabled, mode_flag, output_dir) should be derived from it.

Suggested change
-            eval_mode = dataset.eval_mode or request.eval_mode
-            ppl_enabled = eval_mode == EvalMode.PPL or bool(
-                request.eval_model_info.extra_args.get("ppl", False)
-            )
+            eval_mode = dataset.eval_mode or request.eval_mode
+            if dataset.eval_mode is None and request.eval_model_info.extra_args.get("ppl"):
+                eval_mode = EvalMode.PPL
+            ppl_enabled = eval_mode == EvalMode.PPL
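
To illustrate the mismatch, here is a minimal standalone sketch of the current derivation (the input values are hypothetical):

```python
from enum import Enum

class EvalMode(str, Enum):
    GEN = "gen"
    PPL = "ppl"

# Hypothetical inputs: no per-dataset override, but extra_args requests PPL.
request_eval_mode = EvalMode.GEN
dataset_eval_mode = None
extra_args = {"ppl": True}

# The derivation as written in this diff:
eval_mode = dataset_eval_mode or request_eval_mode                              # EvalMode.GEN
ppl_enabled = eval_mode == EvalMode.PPL or bool(extra_args.get("ppl", False))   # True
mode_flag = "--ppl \\\n    " if eval_mode == EvalMode.PPL else ""               # "" (no --ppl)

print(eval_mode.value, ppl_enabled, repr(mode_flag))
# gen True '' -> PPL generation_kwargs are emitted, but the run command lacks --ppl
# and the results land in an "opencompass-gen-..." directory.
```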

log_probs_enabled = ppl_enabled or bool(
request.eval_model_info.extra_args.get("log_probs")
or request.eval_model_info.extra_args.get("logprobs")
)
output_dir = (
f"/workspace/shared/results/{request.eval_id}/opencompass-"
f"{eval_mode.value}-{dataset.run_id}"
)
mode_flag = "--ppl \\\n " if eval_mode == EvalMode.PPL else ""

# Create a job for each dataset
logger.debug(f"Dataset {dataset.dataset_id} - Run {dataset.run_id}")

@@ -31,8 +45,8 @@ def transform(self, request: EvaluationRequest) -> list:
fi

# Create output directory in shared storage
mkdir -p /workspace/shared/results/{request.eval_id}/opencompass-{dataset.run_id}
echo "Created output directory: /workspace/shared/results/{request.eval_id}/opencompass-{dataset.run_id}"
mkdir -p {output_dir}
echo "Created output directory: {output_dir}"

# Verify directory structure
echo "Directory structure:"
@@ -56,7 +70,12 @@ def transform(self, request: EvaluationRequest) -> list:
max_out_len={int(request.eval_model_info.extra_args.get("max_out_len", str(request.eval_model_info.extra_args.get("max_out_len") or 2048)))},
max_seq_len={int(request.eval_model_info.extra_args.get("max_seq_len", "4096"))},
mode='mid',
batch_size=20
batch_size=20,
generation_kwargs={{
"log_probs": {str(log_probs_enabled).lower()},
"logprobs": {str(log_probs_enabled).lower()},
"ppl": {str(ppl_enabled).lower()},
}}
)
]
EOF
@@ -67,18 +86,19 @@ def transform(self, request: EvaluationRequest) -> list:
# Run OpenCompass evaluation with direct output to shared storage
python /workspace/run.py \\
--models bud_model \\
--datasets {dataset.dataset_id} \\
--work-dir /workspace/shared/results/{request.eval_id}/opencompass-{dataset.run_id} \\
{mode_flag}--datasets {dataset.dataset_id} \\
--work-dir {output_dir} \\
--max-num-workers 8 --debug
"""

logger.debug(f"Generated OpenCompass command: {script}")

job = {
"run_id": dataset.run_id,
"eval_mode": eval_mode,
"dataset": dataset.dataset_id,
"script": script.strip(),
"output_path": f"/workspace/shared/results/{request.eval_id}/opencompass-{dataset.run_id}",
"output_path": output_dir,
}

jobs.append(job)
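
For reference, a job entry that this loop appends for a PPL dataset would look roughly like the following (the eval ID, run ID, and dataset name are hypothetical):

```python
{
    "run_id": "run-2",
    "eval_mode": EvalMode.PPL,
    "dataset": "lambada",
    "script": "<generated shell script>",
    "output_path": "/workspace/shared/results/eval-123/opencompass-ppl-run-2",
}
```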
19 changes: 19 additions & 0 deletions services/budeval/budeval/evals/schema.py
@@ -29,6 +29,13 @@ class EvaluationEngine(str, Enum):
OPENCOMPASS = "opencompass"


class EvalMode(str, Enum):
"""Supported evaluation modes."""

GEN = "gen"
PPL = "ppl"


# --- Evaluation Job Info --- #
class EvalModelInfo(BaseModel):
"""Model information for evaluation."""
@@ -44,6 +51,12 @@ class EvalDataset(BaseModel):

dataset_id: str = Field(..., description="ID of the dataset to be evaluated")
run_id: str = Field(..., description="ID of the run to be evaluated")
eval_mode: Optional[EvalMode] = Field(
default=None,
description=(
"Evaluation mode for the dataset (generation or perplexity); defaults to the request mode when omitted"
),
)


class EvalConfig(BaseModel):
@@ -64,6 +77,12 @@ class EvaluationRequest(CloudEventBase):
# Nested model info structure
eval_model_info: EvalModelInfo = Field(..., description="Model information for evaluation")

# Evaluation mode
eval_mode: EvalMode = Field(
default=EvalMode.GEN,
description="Evaluation mode (e.g., generation or perplexity)",
)

# Structured datasets instead of simple strings
eval_datasets: List[EvalDataset] = Field(..., description="Evaluation datasets")
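
As a usage sketch of the new fields (the dataset IDs are hypothetical), the request-level eval_mode acts as a default that individual datasets can override:

```python
from budeval.evals.schema import EvalDataset, EvalMode

request_mode = EvalMode.GEN  # request-level default

datasets = [
    EvalDataset(dataset_id="gsm8k", run_id="run-1"),                            # inherits the request mode
    EvalDataset(dataset_id="lambada", run_id="run-2", eval_mode=EvalMode.PPL),  # per-dataset override
]

for d in datasets:
    effective_mode = d.eval_mode or request_mode
    print(d.dataset_id, effective_mode.value)
# gsm8k gen
# lambada ppl
```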
