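Pre-commit fixes: formatting, remove `verbose` option, typed markdown parsing

- Apply formatter line wrapping, trailing commas, and double quotes in cli/eval and mellea/stdlib/test_based_eval.py.
- Remove the `verbose` CLI option from `eval_run` and the matching `run_evaluations` parameter.
- Stop adding the generation result to the judge session context in `execute_test_eval`.
- Replace `# type: ignore` in mellea/stdlib/reqlib/md.py with explicit `assert ... is not None` narrowing.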

Keshav Ramji Keshav.Ramji@ibm.com · Keshav Ramji Keshav.Ramji@ibm.com · commit 6c7ad2973002 · 2025-11-25T18:16:01.000Z
diff --git a/cli/eval/commands.py b/cli/eval/commands.py
@@ -9,19 +9,22 @@ def eval_run(
     ),
     backend: str = typer.Option("ollama", "--backend", "-b", help="Generation backend"),
     model: str = typer.Option(None, "--model", help="Generation model name"),
-    max_gen_tokens: int = typer.Option(256, "--max-gen-tokens", help="Max tokens to generate for responses"),
+    max_gen_tokens: int = typer.Option(
+        256, "--max-gen-tokens", help="Max tokens to generate for responses"
+    ),
     judge_backend: str = typer.Option(
         None, "--judge-backend", "-jb", help="Judge backend"
     ),
     judge_model: str = typer.Option(None, "--judge-model", help="Judge model name"),
-    max_judge_tokens: int = typer.Option(256, "--max-judge-tokens", help="Max tokens for the judge model's judgement."),
+    max_judge_tokens: int = typer.Option(
+        256, "--max-judge-tokens", help="Max tokens for the judge model's judgement."
+    ),
     output_path: str = typer.Option(
         "eval_results", "--output-path", "-o", help="Output path for results"
     ),
     output_format: str = typer.Option(
         "json", "--output-format", help="Either json or jsonl format for results"
     ),
-    verbose: bool = typer.Option(False, "--verbose", "-v"),
     continue_on_error: bool = typer.Option(True, "--continue-on-error"),
 ):
     from cli.eval.runner import run_evaluations
@@ -36,7 +39,6 @@ def eval_run(
         max_judge_tokens=max_judge_tokens,
         output_path=output_path,
         output_format=output_format,
-        verbose=verbose,
         continue_on_error=continue_on_error,
     )
 
diff --git a/cli/eval/runner.py b/cli/eval/runner.py
@@ -23,7 +23,7 @@ def __init__(
         model_output: str,
         validation_passed: bool,
         score: int,
-        validation_reason: str, # add input_id
+        validation_reason: str,  # add input_id
     ):
         self.input_text = input_text
         self.model_output = model_output
@@ -74,7 +74,9 @@ def pass_rate(self) -> float:
         return self.passed_count / self.total_count if self.total_count > 0 else 0.0
 
 
-def create_session(backend: str, model: str | None, max_tokens: int | None) -> mellea.MelleaSession:
+def create_session(
+    backend: str, model: str | None, max_tokens: int | None
+) -> mellea.MelleaSession:
     """Create a mellea session with the specified backend and  model."""
 
     model_id = None
@@ -96,35 +98,40 @@ def create_session(backend: str, model: str | None, max_tokens: int | None) -> m
             from mellea.backends.ollama import OllamaModelBackend
 
             backend_instance = OllamaModelBackend(
-                model_id=model_id, model_options={ModelOption.MAX_NEW_TOKENS: max_tokens}
+                model_id=model_id,
+                model_options={ModelOption.MAX_NEW_TOKENS: max_tokens},
             )
 
         elif backend_lower == "openai":
             from mellea.backends.openai import OpenAIBackend
 
             backend_instance = OpenAIBackend(
-                model_id=model_id, model_options={ModelOption.MAX_NEW_TOKENS: max_tokens}
+                model_id=model_id,
+                model_options={ModelOption.MAX_NEW_TOKENS: max_tokens},
             )
 
         elif backend_lower in ["hf", "huggingface"]:
             from mellea.backends.huggingface import LocalHFBackend
 
             backend_instance = LocalHFBackend(
-                model_id=model_id, model_options={ModelOption.MAX_NEW_TOKENS: max_tokens},
+                model_id=model_id,
+                model_options={ModelOption.MAX_NEW_TOKENS: max_tokens},
             )
 
         elif backend_lower == "watsonx":
             from mellea.backends.watsonx import WatsonxAIBackend
 
             backend_instance = WatsonxAIBackend(
-                model_id=model_id, model_options={ModelOption.MAX_NEW_TOKENS: max_tokens}
+                model_id=model_id,
+                model_options={ModelOption.MAX_NEW_TOKENS: max_tokens},
             )
 
         elif backend_lower == "litellm":
             from mellea.backends.litellm import LiteLLMBackend
 
             backend_instance = LiteLLMBackend(
-                model_id=model_id, model_options={ModelOption.MAX_NEW_TOKENS: max_tokens}
+                model_id=model_id,
+                model_options={ModelOption.MAX_NEW_TOKENS: max_tokens},
             )
 
         else:
@@ -135,9 +142,7 @@ def create_session(backend: str, model: str | None, max_tokens: int | None) -> m
         # create session with backend instance
         from mellea.stdlib.base import SimpleContext
 
-        session = mellea.MelleaSession(
-            backend=backend_instance, ctx=SimpleContext()
-        ) 
+        session = mellea.MelleaSession(backend=backend_instance, ctx=SimpleContext())
         return session
 
     except Exception as e:
@@ -157,7 +162,6 @@ def run_evaluations(
     max_judge_tokens: int | None,
     output_path: str,
     output_format: str,
-    verbose: bool,
     continue_on_error: bool,
 ):
     """Run all 'unit test' evaluations"""
@@ -176,14 +180,16 @@ def run_evaluations(
         return
 
     console.print(f"Total test evals to run: {len(all_test_evals)}")
-    total_inputs = sum(len(te.inputs) for te in all_test_evals)
+    total_inputs = sum(len(test_eval.inputs) for test_eval in all_test_evals)
     console.print(f"Total inputs to run: {total_inputs}")
 
     console.print(f"Generation model: {model}")
     console.print(f"Judge model: {judge_model}")
 
     m = create_session(backend=backend, model=model, max_tokens=max_gen_tokens)
-    judge_session = create_session(backend=judge_backend, model=judge_model, max_tokens=max_judge_tokens)
+    judge_session = create_session(
+        backend=judge_backend, model=judge_model, max_tokens=max_judge_tokens
+    )
 
     all_results = []
 
@@ -234,12 +240,14 @@ def execute_test_eval(
         result: ModelOutputThunk = generation_session.act(input_text)
         model_output = str(result)
 
-        judge_session.ctx = judge_session.ctx.add(result)
-
-        targets_for_input = (test_eval.targets[idx] if idx < len(test_eval.targets) else [])
+        targets_for_input = (
+            test_eval.targets[idx] if idx < len(test_eval.targets) else []
+        )
 
         # query the judge
-        judge_prompt = create_judge_requirement(test_eval, input_text, model_output, targets_for_input)
+        judge_prompt = create_judge_requirement(
+            test_eval, input_text, model_output, targets_for_input
+        )
         judge_output_thunk = judge_session.act(judge_prompt)
         judge_output = str(judge_output_thunk)
         score, justification = parse_judge_output(judge_output)
@@ -263,7 +271,10 @@ def execute_test_eval(
 
 
 def create_judge_requirement(
-    test_eval: TestBasedEval, input_text: str, model_output: str, targets_for_input: list[str]
+    test_eval: TestBasedEval,
+    input_text: str,
+    model_output: str,
+    targets_for_input: list[str],
 ):
     """Create judge requirement description"""
 
diff --git a/mellea/stdlib/reqlib/md.py b/mellea/stdlib/reqlib/md.py
@@ -14,11 +14,15 @@ def as_markdown_list(ctx: Context) -> list[str] | None:
     raw_output = ctx.last_output()
     assert raw_output is not None
     try:
-        parsed = mistletoe.Document(raw_output.value)  # type: ignore
-        for child in parsed.children:  # type: ignore
+        assert raw_output.value is not None
+        parsed = mistletoe.Document(raw_output.value)
+        assert parsed.children is not None
+        children = list(parsed.children)
+        for child in children:
             if type(child) is not mistletoe.block_token.List:
                 return None
-        for item in child.children:  # type: ignore
+        assert child.children is not None
+        for item in child.children:
             xs.append(mistletoe.base_renderer.BaseRenderer().render(item))
         return xs
     except Exception:
@@ -44,10 +48,13 @@ def _md_table(ctx: Context):
     raw_output = ctx.last_output()
     assert raw_output is not None
     try:
-        parsed = mistletoe.Document(raw_output.value)  # type: ignore
-        if len(parsed.children) != 1:  # type: ignore
+        assert raw_output.value is not None
+        parsed = mistletoe.Document(raw_output.value)
+        assert parsed.children is not None
+        children = list(parsed.children)
+        if len(children) != 1:
             return False
-        return type(parsed.children[0]) is mistletoe.block_token.Table  # type: ignore
+        return type(children[0]) is mistletoe.block_token.Table
     except Exception:
         return False
 
diff --git a/mellea/stdlib/test_based_eval.py b/mellea/stdlib/test_based_eval.py
@@ -18,7 +18,7 @@ def __init__(
         inputs: list[str],
         targets: list[list[str]] | None = None,  # can be optional
         test_id: str | None = None,
-        input_ids: list[str] | None = None
+        input_ids: list[str] | None = None,
     ):
         """Initialize TestBasedEval (for a single unit test)."""
         self.source = source
@@ -61,7 +61,7 @@ def from_json_file(cls, filepath: str) -> list["TestBasedEval"]:
         """Load test evaluations from json/jsonl file, return list of TestBasedEval instances, one per 'unit test'."""
         path = Path(filepath)
 
-        with path.open('r') as f:
+        with path.open("r") as f:
             data = json.load(f)
 
         if not isinstance(data, list):
@@ -77,12 +77,18 @@ def from_json_file(cls, filepath: str) -> list["TestBasedEval"]:
 
             for example in examples:
                 input_messages = example.get("input", [])
-                user_messages = [msg for msg in input_messages if msg.get("role") == "user"]
+                user_messages = [
+                    msg for msg in input_messages if msg.get("role") == "user"
+                ]
                 if user_messages:
                     inputs.append(user_messages[-1].get("content", ""))
 
                 target_messages = example.get("targets", [])
-                targets_for_input = [msg.get("content", "") for msg in target_messages if msg.get("role") == "assistant"]
+                targets_for_input = [
+                    msg.get("content", "")
+                    for msg in target_messages
+                    if msg.get("role") == "assistant"
+                ]
                 targets.append(targets_for_input)
 
                 input_ids.append(example.get("input_id", ""))
@@ -94,8 +100,8 @@ def from_json_file(cls, filepath: str) -> list["TestBasedEval"]:
                 inputs=inputs,
                 targets=targets,
                 test_id=test_data.get("id", ""),
-                input_ids=input_ids
+                input_ids=input_ids,
             )
             test_evals.append(test_eval)
 
-        return test_evals
+        return test_evals