diff --git a/cookbooks/grader_validation/rewardbench2.py b/cookbooks/grader_validation/rewardbench2.py
index 7189cdce..45431d63 100644
--- a/cookbooks/grader_validation/rewardbench2.py
+++ b/cookbooks/grader_validation/rewardbench2.py
@@ -307,8 +307,10 @@ async def _evaluate_four_way(
         GraderScore: Result with score=1.0 if predicted best answer matches ground truth
     """
     # Handle None case for mutable arguments
-    answers = answers if answers is not None else []
-    chosen_indices = chosen_indices if chosen_indices is not None else []
+    if not answers:
+        answers = []
+    if not chosen_indices:
+        chosen_indices = []
 
     # Ensure we have exactly 4 answers
     if len(answers) < 4:
@@ -402,8 +404,10 @@ async def _evaluate_ties(
         GraderScore: Result with score=1.0 if any top-rated answer is in chosen_indices
     """
     # Handle None case for mutable arguments
-    answers = answers if answers is not None else []
-    chosen_indices = chosen_indices if chosen_indices is not None else []
+    if not answers:
+        answers = []
+    if not chosen_indices:
+        chosen_indices = []
 
     correct_indices = set(chosen_indices)
 
diff --git a/cookbooks/training_judge_model/bradley-terry/dataset.py b/cookbooks/training_judge_model/bradley-terry/dataset.py
index 3baf3c6e..8ee3d8cc 100644
--- a/cookbooks/training_judge_model/bradley-terry/dataset.py
+++ b/cookbooks/training_judge_model/bradley-terry/dataset.py
@@ -120,7 +120,7 @@ def _tokenize_messages(self, messages: List[Dict[str, str]]) -> Dict[str, torch.
         # Handle sequence length like SFT dataset
         if sequence_length < self.max_length:
             # Pad sequences
-            pad_token_id = self.tokenizer.pad_token_id if self.tokenizer.pad_token_id is not None else 0
+            pad_token_id = self.tokenizer.pad_token_id if self.tokenizer.pad_token_id else 0
             padded_input_ids = (
                 torch.ones(
                     size=(self.max_length - sequence_length,),
diff --git a/cookbooks/training_judge_model/bradley-terry/trainer.py b/cookbooks/training_judge_model/bradley-terry/trainer.py
index d21aefb8..3228de28 100644
--- a/cookbooks/training_judge_model/bradley-terry/trainer.py
+++ b/cookbooks/training_judge_model/bradley-terry/trainer.py
@@ -163,7 +163,7 @@ def _build_model_optimizer(self):
             verbose=True,
         )
 
-        if self.config.model.external_lib is not None:
+        if self.config.model.external_lib:
             import importlib
 
             importlib.import_module(self.config.model.external_lib)
@@ -538,11 +538,10 @@ def fit(self):
         last_valid_metric = None
         latest_train_metric = {}
 
-        total_training_steps = len(self.train_dataloader) * self.config.trainer.total_epochs
-        if self.config.trainer.total_training_steps is not None:
-            total_training_steps = self.config.trainer.total_training_steps
-
-        self.total_training_steps = total_training_steps
+        if self.config.trainer.total_training_steps:
+            self.total_training_steps = self.config.trainer.total_training_steps
+        else:
+            self.total_training_steps = len(self.train_dataloader) * self.config.trainer.total_epochs
         print(f"Total training steps: {self.total_training_steps}")
 
         # Create a single progress bar for all training steps
@@ -690,7 +689,7 @@ def run_bt_training(config):
     )
 
     # Ensure pad token exists
-    if tokenizer.pad_token is None:
+    if not tokenizer.pad_token:
         tokenizer.pad_token = tokenizer.eos_token
 
     # Create datasets
diff --git a/openjudge/analyzer/statistical/consistency_analyzer.py b/openjudge/analyzer/statistical/consistency_analyzer.py
index 379a8184..3eebb568 100644
--- a/openjudge/analyzer/statistical/consistency_analyzer.py
+++ b/openjudge/analyzer/statistical/consistency_analyzer.py
@@ -101,21 +101,24 @@ def analyze(
             >>> print(f"Consistency: {result.consistency:.2f}")
             Consistency: 0.99
         """
-        # Handle the case where the method is called with the old signature
-        # i.e., analyze(first_run_results, second_run_results)
-        first_run_results = grader_results
-        second_run_results = another_grader_results
-
-        # If the parameters were passed positionally as before, dataset will be first_run_results
-        # and grader_results will be second_run_results
-        if first_run_results is None and second_run_results is None:
-            if dataset is not None and grader_results is not None:
-                first_run_results = dataset
-                second_run_results = grader_results
-            else:
-                # If still not set, use empty lists
-                first_run_results = []
-                second_run_results = []
+        # Support the old 2-argument call signature: analyze(first_run_results, second_run_results).
+        # Determine which argument holds the first-run results and which holds the second-run results.
+        if grader_results and another_grader_results:
+            # Current call signature.
+            first_run_results = grader_results
+            second_run_results = another_grader_results
+        elif dataset and grader_results:
+            # The first two arguments have values but the third does not.
+            # Treat this as a call following the old 2-argument signature.
+            first_run_results = dataset
+            second_run_results = grader_results
+        else:
+            # Either 1. there are not enough arguments for the current call signature
+            #    (dataset and another_grader_results have values, but grader_results does not),
+            #    or 2. none of dataset, grader_results, another_grader_results is provided.
+            # In both cases fall back to empty lists.
+            first_run_results = []
+            second_run_results = []
 
         if not first_run_results or not second_run_results:
             logger.warning(
diff --git a/openjudge/generator/iterative_rubric/categorizer.py b/openjudge/generator/iterative_rubric/categorizer.py
index fa08fb5d..d599d401 100644
--- a/openjudge/generator/iterative_rubric/categorizer.py
+++ b/openjudge/generator/iterative_rubric/categorizer.py
@@ -238,7 +238,7 @@ async def categorize_rubrics(
 
             >>> categorized_rubrics, info = await categorizer.categorize_rubrics(rubrics)
         """
-        if len(rubrics) == 0:
+        if not rubrics:
             logger.error("Input rubrics list is empty")
             return [], {
                 "categorization_successful": False,
diff --git a/openjudge/generator/iterative_rubric/generator.py b/openjudge/generator/iterative_rubric/generator.py
index 94b6f0f0..b4ccbb29 100644
--- a/openjudge/generator/iterative_rubric/generator.py
+++ b/openjudge/generator/iterative_rubric/generator.py
@@ -265,7 +265,7 @@ async def generate(
             grader_kwargs["max_score"] = self.config.max_score
 
         # Add custom template if provided
-        if hasattr(self.config, "custom_evaluation_prompt") and self.config.custom_evaluation_prompt is not None:
+        if hasattr(self.config, "custom_evaluation_prompt") and self.config.custom_evaluation_prompt:
             grader_kwargs["template"] = self.config.custom_evaluation_prompt
 
         return LLMGrader(**grader_kwargs)
diff --git a/openjudge/graders/agent/action/action_alignment.py b/openjudge/graders/agent/action/action_alignment.py
index 13f4016c..c083a11a 100644
--- a/openjudge/graders/agent/action/action_alignment.py
+++ b/openjudge/graders/agent/action/action_alignment.py
@@ -185,10 +185,9 @@ def __init__(
             mode=GraderMode.POINTWISE,
             description="Evaluate action alignment with plan",
             model=model,
-            template=template,
+            template=template or DEFAULT_ACTION_ALIGNMENT_TEMPLATE,
             language=language,
         )
-        self.template = template if template is not None else DEFAULT_ACTION_ALIGNMENT_TEMPLATE
 
     def _format_history(self, history: Optional[list] = None) -> str:
         """Format history steps for evaluation.
diff --git a/openjudge/graders/agent/memory/memory_detail_preservation.py b/openjudge/graders/agent/memory/memory_detail_preservation.py
index 8d64a9e9..28b3eb6c 100644
--- a/openjudge/graders/agent/memory/memory_detail_preservation.py
+++ b/openjudge/graders/agent/memory/memory_detail_preservation.py
@@ -176,10 +176,9 @@ def __init__(
             mode=GraderMode.POINTWISE,
             description="Evaluate memory detail preservation",
             model=model,
-            template=template,
+            template=template or DEFAULT_MEMORY_DETAIL_PRESERVATION_TEMPLATE,
             language=language,
         )
-        self.template = template if template is not None else DEFAULT_MEMORY_DETAIL_PRESERVATION_TEMPLATE
 
     def _format_history(self, history: Optional[list] = None) -> str:
         """Format history steps for evaluation.
diff --git a/openjudge/graders/agent/memory/memory_retrieval_effectiveness.py b/openjudge/graders/agent/memory/memory_retrieval_effectiveness.py
index 50256a16..2cb9e5b1 100644
--- a/openjudge/graders/agent/memory/memory_retrieval_effectiveness.py
+++ b/openjudge/graders/agent/memory/memory_retrieval_effectiveness.py
@@ -179,10 +179,9 @@ def __init__(
             mode=GraderMode.POINTWISE,
             description="Evaluate memory retrieval effectiveness",
             model=model,
-            template=template,
+            template=template or DEFAULT_MEMORY_RETRIEVAL_EFFECTIVENESS_TEMPLATE,
             language=language,
         )
-        self.template = template if template is not None else DEFAULT_MEMORY_RETRIEVAL_EFFECTIVENESS_TEMPLATE
 
     def _format_history(self, history: Optional[list] = None) -> str:
         """Format history steps for evaluation.
diff --git a/openjudge/graders/agent/plan/plan_feasibility.py b/openjudge/graders/agent/plan/plan_feasibility.py
index 5104988a..ebf4dd68 100644
--- a/openjudge/graders/agent/plan/plan_feasibility.py
+++ b/openjudge/graders/agent/plan/plan_feasibility.py
@@ -179,10 +179,9 @@ def __init__(
             mode=GraderMode.POINTWISE,
             description="Evaluate plan feasibility",
             model=model,
-            template=template,
+            template=template or DEFAULT_PLAN_FEASIBILITY_TEMPLATE,
             language=language,
         )
-        self.template = template if template is not None else DEFAULT_PLAN_FEASIBILITY_TEMPLATE
 
     def _format_history(self, history: Optional[list] = None) -> str:
         """Format history steps for evaluation.
diff --git a/openjudge/graders/agent/reflection/reflection_accuracy.py b/openjudge/graders/agent/reflection/reflection_accuracy.py
index 0697d569..296cfab7 100644
--- a/openjudge/graders/agent/reflection/reflection_accuracy.py
+++ b/openjudge/graders/agent/reflection/reflection_accuracy.py
@@ -176,10 +176,9 @@ def __init__(
             mode=GraderMode.POINTWISE,
             description="Evaluate reflection accuracy",
             model=model,
-            template=template,
+            template=template or DEFAULT_REFLECTION_ACCURACY_TEMPLATE,
             language=language,
         )
-        self.template = template if template is not None else DEFAULT_REFLECTION_ACCURACY_TEMPLATE
 
     def _format_history(self, history: Optional[list] = None) -> str:
         """Format history steps for evaluation.
diff --git a/openjudge/graders/agent/reflection/reflection_outcome_understanding.py b/openjudge/graders/agent/reflection/reflection_outcome_understanding.py
index c4cec5b4..6a15b2ef 100644
--- a/openjudge/graders/agent/reflection/reflection_outcome_understanding.py
+++ b/openjudge/graders/agent/reflection/reflection_outcome_understanding.py
@@ -300,10 +300,9 @@ def __init__(
             mode=GraderMode.POINTWISE,
             description="Evaluate reflection outcome understanding",
             model=model,
-            template=template,
+            template=template or DEFAULT_REFLECTION_OUTCOME_UNDERSTANDING_TEMPLATE,
             language=language,
         )
-        self.template = template if template is not None else DEFAULT_REFLECTION_OUTCOME_UNDERSTANDING_TEMPLATE
 
     def _format_history(self, history: Optional[list] = None) -> str:
         """Format history steps for evaluation.
diff --git a/openjudge/graders/agent/reflection/reflection_progress_awareness.py b/openjudge/graders/agent/reflection/reflection_progress_awareness.py
index e434443b..896b2cb3 100644
--- a/openjudge/graders/agent/reflection/reflection_progress_awareness.py
+++ b/openjudge/graders/agent/reflection/reflection_progress_awareness.py
@@ -217,10 +217,9 @@ def __init__(
             mode=GraderMode.POINTWISE,
             description="Evaluate reflection progress awareness",
             model=model,
-            template=template,
+            template=template or DEFAULT_REFLECTION_PROGRESS_AWARENESS_TEMPLATE,
             language=language,
         )
-        self.template = template if template is not None else DEFAULT_REFLECTION_PROGRESS_AWARENESS_TEMPLATE
 
     def _format_history(self, history: Optional[list] = None) -> str:
         """Format history steps for evaluation.
diff --git a/openjudge/graders/agent/tool/tool_call_accuracy.py b/openjudge/graders/agent/tool/tool_call_accuracy.py
index 1466f627..7551dfeb 100644
--- a/openjudge/graders/agent/tool/tool_call_accuracy.py
+++ b/openjudge/graders/agent/tool/tool_call_accuracy.py
@@ -209,10 +209,9 @@ def __init__(
             mode=GraderMode.POINTWISE,
             description="Evaluates the accuracy of tool calls made by an agent",
             model=model,
-            template=template,
+            template=template or DEFAULT_TOOL_CALL_ACCURACY_TEMPLATE,
             language=language,
         )
-        self.template = template if template is not None else DEFAULT_TOOL_CALL_ACCURACY_TEMPLATE
 
     def _parse_tools_from_response(
         self,
diff --git a/openjudge/graders/agent/tool/tool_parameter_check.py b/openjudge/graders/agent/tool/tool_parameter_check.py
index 948ffefa..442719b3 100644
--- a/openjudge/graders/agent/tool/tool_parameter_check.py
+++ b/openjudge/graders/agent/tool/tool_parameter_check.py
@@ -189,10 +189,9 @@ def __init__(
             mode=GraderMode.POINTWISE,
             description="Evaluate tool parameter extraction correctness",
             model=model,
-            template=template,
+            template=template or DEFAULT_TOOL_PARAMETER_CHECK_TEMPLATE,
             language=language,
         )
-        self.template = template if template is not None else DEFAULT_TOOL_PARAMETER_CHECK_TEMPLATE
 
     async def aevaluate(
         self,
diff --git a/openjudge/graders/agent/tool/tool_selection.py b/openjudge/graders/agent/tool/tool_selection.py
index 39da2955..dec310b1 100644
--- a/openjudge/graders/agent/tool/tool_selection.py
+++ b/openjudge/graders/agent/tool/tool_selection.py
@@ -202,10 +202,9 @@ def __init__(
             mode=GraderMode.POINTWISE,
             description="Evaluate tool selection ",
             model=model,
-            template=template,
+            template=template or DEFAULT_TOOL_SELECTION_TEMPLATE,
             language=language,
         )
-        self.template = template if template is not None else DEFAULT_TOOL_SELECTION_TEMPLATE
 
     async def aevaluate(
         self,
diff --git a/openjudge/graders/agent/trajectory/__init__.py b/openjudge/graders/agent/trajectory/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/openjudge/graders/code/_utils/testing_util.py b/openjudge/graders/code/_utils/testing_util.py
index 797b3e59..b8156ac3 100644
--- a/openjudge/graders/code/_utils/testing_util.py
+++ b/openjudge/graders/code/_utils/testing_util.py
@@ -154,6 +154,10 @@ def run_test(in_outs, test=None, timeout=15):
     """
     # Disable functionalities that can make destructive changes to the test.
    reliability_guard()
+
+    if not test:
+        raise AssertionError("should not happen: missing test code input")
+
     method_name = None
     tmp = None
     which_type = None
@@ -170,9 +174,6 @@
 
     logger.debug(f"loaded input_output = {datetime.now().time()}")
 
-    if test is None:
-        raise AssertionError("should not happen: test code is none")
-
     results = []
     sol = """from string import *
 from re import *
diff --git a/openjudge/graders/common/hallucination.py b/openjudge/graders/common/hallucination.py
index f793e0b5..5a35c319 100644
--- a/openjudge/graders/common/hallucination.py
+++ b/openjudge/graders/common/hallucination.py
@@ -265,11 +265,10 @@ def __init__(
             mode=GraderMode.POINTWISE,
             description="Evaluate whether response contains hallucinations",
             model=model,
-            template=template,
+            template=template or DEFAULT_HALLUCINATION_TEMPLATE,
             language=language,
         )
         self.threshold = threshold
-        self.template = template if template is not None else DEFAULT_HALLUCINATION_TEMPLATE
 
     async def aevaluate(
         self,
diff --git a/openjudge/graders/multimodal/_internal/criteria_utils.py b/openjudge/graders/multimodal/_internal/criteria_utils.py
index 452cde55..ef71e74e 100644
--- a/openjudge/graders/multimodal/_internal/criteria_utils.py
+++ b/openjudge/graders/multimodal/_internal/criteria_utils.py
@@ -81,7 +81,7 @@ def validate_and_sort_rubrics(
         ... ]
         >>> sorted_rubrics = validate_and_sort_rubrics(rubrics)
     """
-    if rubrics is None:
+    if not rubrics:
         return None
 
     # Sort rubrics by start of range
@@ -120,7 +120,7 @@ def format_rubrics(rubrics: Optional[List[Rubric]]) -> Optional[str]:
         0-3: Poor quality
         7-10: High quality
     """
-    if rubrics is None:
+    if not rubrics:
         return None
 
     return "\n".join(
@@ -177,7 +177,7 @@ def get_score_range(rubric: Optional[List[Rubric]]) -> Tuple[int, int]:
         >>> get_score_range(rubrics)
         (0, 10)
     """
-    if rubric is None:
+    if not rubric:
         return (0, 10)
 
     return rubric[0].score_range[0], rubric[-1].score_range[1]
diff --git a/openjudge/models/formatter/dashscope_formatter.py b/openjudge/models/formatter/dashscope_formatter.py
index b0b3debd..b445ae8f 100644
--- a/openjudge/models/formatter/dashscope_formatter.py
+++ b/openjudge/models/formatter/dashscope_formatter.py
@@ -75,8 +75,8 @@ def _convert_content_to_openai(
         Returns:
             Content in OpenAI format.
         """
-        # If content is None, return empty string
-        if content is None:
+        # Return empty string if no content input
+        if not content:
             return ""
 
         # If content is a string, return as is
@@ -135,8 +135,8 @@ def _convert_content_to_dashscope(
         Returns:
             Content in DashScope format.
""" - # If content is None, return empty string - if content is None: + # Return empty string if no content input + if not content: return "" # If content is a string, return as is diff --git a/openjudge/models/schema/prompt_template.py b/openjudge/models/schema/prompt_template.py index cf3018bd..ea39ef81 100644 --- a/openjudge/models/schema/prompt_template.py +++ b/openjudge/models/schema/prompt_template.py @@ -184,7 +184,7 @@ def to_messages( if isinstance(self.messages, list): messages = self.messages elif isinstance(self.messages, dict): - if language is None: + if not language: language = LanguageEnum.EN assert language in self.messages messages = self.messages.get(language, []) diff --git a/openjudge/runner/aggregator/base_aggregator.py b/openjudge/runner/aggregator/base_aggregator.py index 97cae548..979f493a 100644 --- a/openjudge/runner/aggregator/base_aggregator.py +++ b/openjudge/runner/aggregator/base_aggregator.py @@ -37,12 +37,12 @@ def __name__(self): return self.name @abstractmethod - def __call__(self, results: Dict[str, GraderResult], **kwargs) -> GraderResult: + def __call__(self, grader_results: Dict[str, GraderResult], **kwargs) -> GraderResult: """ Aggregate results from multiple graders for a single sample. Args: - results: Dictionary mapping grader names to GraderResult objects for a single sample + grader_results: Dictionary mapping grader names to GraderResult objects for a single sample **kwargs: Additional arguments for aggregation Returns: diff --git a/openjudge/runner/aggregator/weighted_sum_aggregator.py b/openjudge/runner/aggregator/weighted_sum_aggregator.py index 96646804..e7c0dadb 100644 --- a/openjudge/runner/aggregator/weighted_sum_aggregator.py +++ b/openjudge/runner/aggregator/weighted_sum_aggregator.py @@ -28,27 +28,27 @@ def __init__(self, name: str, weights: Dict[str, float] = None): super().__init__(name) self.weights = weights or {} - def __call__(self, results: Dict[str, GraderResult], **kwargs) -> GraderResult: + def __call__(self, grader_results: Dict[str, GraderResult], **kwargs) -> GraderResult: """ - Aggregate results using weighted sum for a single sample. + Aggregate multiple grader results using weighted sum for a single sample. Args: - results: Dictionary mapping grader names to GraderResult objects for a single sample + grader_results: Dictionary mapping grader names to GraderResult objects for a single sample **kwargs: Additional arguments (unused) Returns: Aggregated result as a GraderResult object """ - if not results: + if not grader_results: return GraderError( name=self.name, - reason="No results to aggregate", - error="No results provided for aggregation", + reason="No grader result to aggregate", + error="No grader result provided for aggregation", ) # Initialize weights if not provided (equal weights) if not self.weights: - grader_names = list(results.keys()) + grader_names = list(grader_results.keys()) equal_weight = 1.0 / len(grader_names) if grader_names else 0.0 weights = {name: equal_weight for name in grader_names} else: @@ -59,8 +59,8 @@ def __call__(self, results: Dict[str, GraderResult], **kwargs) -> GraderResult: component_scores = {} # Collect scores from all graders for this sample - for grader_name, result in results.items(): - # Only process GraderScore results (skip errors, ranks, etc.) + for grader_name, result in grader_results.items(): + # Only process results of GraderScore type (skip errors, ranks, etc.) 
             if isinstance(result, GraderScore):
                 weight = weights.get(grader_name, 0.0)
                 weighted_sum += result.score * weight
diff --git a/openjudge/runner/grading_runner.py b/openjudge/runner/grading_runner.py
index b293ac53..ef678a6c 100644
--- a/openjudge/runner/grading_runner.py
+++ b/openjudge/runner/grading_runner.py
@@ -177,7 +177,7 @@ def __init__(
             concurrency_manager.set_max_concurrency(max_concurrency)
 
         # Handle aggregators
-        if aggregators is None:
+        if not aggregators:
             self.aggregators = []
         elif isinstance(aggregators, BaseAggregator):
             self.aggregators = [aggregators]
@@ -345,7 +345,7 @@ async def arun(
         if self.show_progress:
             all_results = await tqdm_asyncio.gather(
                 *all_coroutines,
-                desc="Grading",
+                desc="Evaluating a dataset",
                 total=len(all_coroutines),
             )
         else:
@@ -472,7 +472,7 @@ async def arun_multiple_datasets(
         if original_show_progress:
             all_results = await tqdm_asyncio.gather(
                 *tasks,
-                desc="Grading Datasets",
+                desc=f"Evaluating {len(tasks)} datasets",
                 total=len(tasks),
             )
         else:
diff --git a/openjudge/utils/concurrency.py b/openjudge/utils/concurrency.py
index 8c41a199..82b3701e 100644
--- a/openjudge/utils/concurrency.py
+++ b/openjudge/utils/concurrency.py
@@ -19,7 +19,7 @@ class ConcurrencyManager:
     _instance = None
 
     def __new__(cls):
-        if cls._instance is None:
+        if not cls._instance:
             cls._instance = super(ConcurrencyManager, cls).__new__(cls)
         return cls._instance
 
diff --git a/openjudge/utils/instance.py b/openjudge/utils/instance.py
index 50acbcaa..6e893896 100644
--- a/openjudge/utils/instance.py
+++ b/openjudge/utils/instance.py
@@ -80,7 +80,7 @@ class should be subclass of. If provided, will check
     # If config is already an instance, just check its type
     if not isinstance(config, dict):
         instance = config
-        if accept_type is not None and not isinstance(instance, accept_type):
+        if accept_type and not isinstance(instance, accept_type):
             raise TypeError(
                 f"Provided instance {instance.__class__.__name__} " f"is not an instance of {accept_type.__name__}",
             )
@@ -98,7 +98,7 @@ class should be subclass of. If provided, will check
     cls = getattr(module, class_name)
 
     # Check type if accept_type is provided
-    if accept_type is not None and not issubclass(cls, accept_type):
+    if accept_type and not issubclass(cls, accept_type):
         raise TypeError(
             f"Instantiated class {cls.__name__} is not a subclass of {accept_type.__name__}",
         )
diff --git a/tests/runner/aggregator/test_weighted_sum_aggregator.py b/tests/runner/aggregator/test_weighted_sum_aggregator.py
index 61eda44b..f6ace6b6 100644
--- a/tests/runner/aggregator/test_weighted_sum_aggregator.py
+++ b/tests/runner/aggregator/test_weighted_sum_aggregator.py
@@ -31,12 +31,12 @@ def test_initialization(self):
     def test_empty_results(self):
         """Test aggregation with empty results"""
         aggregator = WeightedSumAggregator(name="test_agg")
-        result = aggregator(results={})
+        result = aggregator(grader_results={})
 
         assert isinstance(result, GraderError)
         assert result.name == "test_agg"
-        assert result.reason == "No results to aggregate"
-        assert result.error == "No results provided for aggregation"
+        assert result.reason == "No grader result to aggregate"
+        assert result.error == "No grader result provided for aggregation"
 
     def test_equal_weight_aggregation(self):
         """Test aggregation with equal weights (default behavior)"""
@@ -48,7 +48,7 @@ def test_equal_weight_aggregation(self):
             "grader3": GraderScore(name="grader3", score=4.0, reason="Poor response"),
         }
 
-        aggregated_result = aggregator(results=results)
+        aggregated_result = aggregator(grader_results=results)
 
         assert isinstance(aggregated_result, GraderScore)
         assert aggregated_result.name == "test_agg"
@@ -67,7 +67,7 @@ def test_weighted_aggregation(self):
             "grader3": GraderScore(name="grader3", score=0.0, reason="Poor"),
         }
 
-        aggregated_result = aggregator(results=results)
+        aggregated_result = aggregator(grader_results=results)
 
         assert isinstance(aggregated_result, GraderScore)
         assert aggregated_result.name == "test_agg"
@@ -85,7 +85,7 @@ def test_mixed_result_types(self):
             "rank_grader": GraderRank(name="rank_grader", rank=[1, 2, 3], reason="Ranked"),
         }
 
-        aggregated_result = aggregator(results=results)
+        aggregated_result = aggregator(grader_results=results)
 
         assert isinstance(aggregated_result, GraderScore)
         assert aggregated_result.name == "test_agg"
@@ -105,7 +105,7 @@ def test_zero_weight_aggregation(self):
             "grader2": GraderScore(name="grader2", score=5.0, reason="Average"),
         }
 
-        aggregated_result = aggregator(results=results)
+        aggregated_result = aggregator(grader_results=results)
 
         assert isinstance(aggregated_result, GraderScore)
         assert aggregated_result.name == "test_agg"
@@ -123,7 +123,7 @@ def test_missing_weights(self):
         }
 
         # grader2 should get default weight of 0.0
-        aggregated_result = aggregator(results=results)
+        aggregated_result = aggregator(grader_results=results)
 
         assert isinstance(aggregated_result, GraderScore)
         assert aggregated_result.name == "test_agg"