diff --git a/bugbug/code_search/mozilla.py b/bugbug/code_search/mozilla.py index b80c25db0a..1c83af4578 100644 --- a/bugbug/code_search/mozilla.py +++ b/bugbug/code_search/mozilla.py @@ -10,10 +10,10 @@ class FunctionSearchMozilla(FunctionSearch): - def __init__(self, repo_dir, get_file, fast=False): + def __init__(self, repo_dir, get_file=None, fast=False): super().__init__() self.repo_dir = repo_dir - self.get_file = get_file + self.get_file = get_file or FunctionSearchSearchfoxAPI._get_file self.fast = fast def get_function( diff --git a/bugbug/code_search/searchfox_api.py b/bugbug/code_search/searchfox_api.py index 9aa7b76d5a..56d6579943 100644 --- a/bugbug/code_search/searchfox_api.py +++ b/bugbug/code_search/searchfox_api.py @@ -214,9 +214,18 @@ def search(commit_hash, symbol_name): class FunctionSearchSearchfoxAPI(FunctionSearch): - def __init__(self, get_file): + def __init__(self, get_file=None): super().__init__() - self.get_file = get_file + self.get_file = get_file or self._get_file + + @staticmethod + def _get_file(commit_hash, path): + r = utils.get_session("hgmo").get( + f"https://hg.mozilla.org/mozilla-unified/raw-file/{commit_hash}/{path}", + headers={"User-Agent": utils.get_user_agent()}, + ) + r.raise_for_status() + return r.text def definitions_to_results(self, commit_hash, definitions): result = [] diff --git a/bugbug/tools/base.py b/bugbug/tools/base.py index c36b1a19c5..2680bd5f85 100644 --- a/bugbug/tools/base.py +++ b/bugbug/tools/base.py @@ -8,13 +8,5 @@ class GenerativeModelTool(ABC): - @property - @abstractmethod - def version(self) -> str: ... - @abstractmethod def run(self, *args, **kwargs) -> Any: ... - - @staticmethod - def _print_answer(answer): - print(f"\u001b[33;1m\033[1;3m{answer}\u001b[0m") diff --git a/bugbug/tools/code_review/agent.py b/bugbug/tools/code_review/agent.py index 203d95e2ba..32bd38c971 100644 --- a/bugbug/tools/code_review/agent.py +++ b/bugbug/tools/code_review/agent.py @@ -9,44 +9,39 @@ import os from datetime import datetime from logging import getLogger -from typing import Iterable, Optional +from typing import Optional, Protocol from langchain.agents import create_agent from langchain.agents.structured_output import ProviderStrategy from langchain.chat_models import BaseChatModel from langchain.messages import HumanMessage -from langchain_classic.chains import LLMChain -from langchain_classic.prompts import PromptTemplate from langgraph.errors import GraphRecursionError from pydantic import BaseModel, Field from unidiff import PatchSet from bugbug.code_search.function_search import FunctionSearch from bugbug.tools.base import GenerativeModelTool -from bugbug.tools.code_review.database import ReviewCommentsDB, SuggestionsFeedbackDB +from bugbug.tools.code_review.database import ReviewCommentsDB from bugbug.tools.code_review.langchain_tools import ( CodeReviewContext, create_find_function_definition_tool, expand_context, ) from bugbug.tools.code_review.prompts import ( - DEFAULT_REJECTED_EXAMPLES, FIRST_MESSAGE_TEMPLATE, - PROMPT_TEMPLATE_FILTERING_ANALYSIS, - PROMPT_TEMPLATE_SUMMARIZATION, STATIC_COMMENT_EXAMPLES, SYSTEM_PROMPT_TEMPLATE, TEMPLATE_COMMENT_EXAMPLE, TEMPLATE_PATCH_FROM_HUNK, ) from bugbug.tools.code_review.utils import ( + convert_generated_comments_to_inline, format_patch_set, - generate_processed_output, ) from bugbug.tools.core.data_types import InlineComment from bugbug.tools.core.exceptions import LargeDiffError, ModelResultError from bugbug.tools.core.llms import get_tokenizer -from 
bugbug.tools.core.platforms.base import Patch +from bugbug.tools.core.platforms.base import Patch, ReviewData logger = getLogger(__name__) @@ -73,25 +68,35 @@ class AgentResponse(BaseModel): ) -class CodeReviewTool(GenerativeModelTool): - version = "0.0.1" +class PatchSummarizer(Protocol): + def run(self, patch: Patch) -> str: ... + + +class SuggestionFilterer(Protocol): + def run( + self, suggestions: list[GeneratedReviewComment] + ) -> list[GeneratedReviewComment]: ... + +class CodeReviewTool(GenerativeModelTool): def __init__( self, llm: BaseChatModel, - summarization_llm: BaseChatModel, - filtering_llm: BaseChatModel, + patch_summarizer: PatchSummarizer, + suggestion_filterer: SuggestionFilterer, + review_data: ReviewData, function_search: Optional[FunctionSearch] = None, review_comments_db: Optional["ReviewCommentsDB"] = None, show_patch_example: bool = False, verbose: bool = True, - suggestions_feedback_db: Optional["SuggestionsFeedbackDB"] = None, target_software: str = "Mozilla Firefox", ) -> None: super().__init__() self.target_software = target_software + self.review_data = review_data + self._tokenizer = get_tokenizer( llm.model_name if hasattr(llm, "model_name") else "" ) @@ -108,24 +113,8 @@ def __init__( "----------------------------------------------------" ) - self.summarization_chain = LLMChain( - prompt=PromptTemplate.from_template( - PROMPT_TEMPLATE_SUMMARIZATION, - partial_variables={ - "experience_scope": f"the {self.target_software} source code" - }, - ), - llm=summarization_llm, - verbose=verbose, - ) - self.filtering_chain = LLMChain( - prompt=PromptTemplate.from_template( - PROMPT_TEMPLATE_FILTERING_ANALYSIS, - partial_variables={"target_code_consistency": self.target_software}, - ), - llm=filtering_llm, - verbose=verbose, - ) + self.patch_summarizer = patch_summarizer + self.suggestion_filterer = suggestion_filterer tools = [expand_context] if function_search: @@ -146,60 +135,77 @@ def __init__( self.verbose = verbose - self.suggestions_feedback_db = suggestions_feedback_db - @staticmethod - def create( - llm=None, summarization_llm=None, filtering_llm=None, **kwargs - ) -> "CodeReviewTool": - from bugbug.tools.core.llms import create_anthropic_llm - - return CodeReviewTool( - llm=llm - or create_anthropic_llm( + def create(**kwargs): + """Factory method to instantiate the tool with default dependencies. + + This method takes the same parameters as the constructor, but all + parameters are optional. If a parameter is not provided, a default + component will be created and used. 
+ """ + if "function_search" not in kwargs: + from bugbug.code_search.searchfox_api import FunctionSearchSearchfoxAPI + + kwargs["function_search"] = FunctionSearchSearchfoxAPI() + + if "review_comments_db" not in kwargs: + from bugbug.tools.code_review.database import ReviewCommentsDB + from bugbug.vectordb import QdrantVectorDB + + kwargs["review_comments_db"] = ReviewCommentsDB( + QdrantVectorDB("diff_comments") + ) + + if "review_data" not in kwargs: + from bugbug.tools.core.platforms.phabricator import PhabricatorReviewData + + kwargs["review_data"] = PhabricatorReviewData() + + if "llm" not in kwargs: + from bugbug.tools.core.llms import create_anthropic_llm + + kwargs["llm"] = create_anthropic_llm( model_name="claude-opus-4-5-20251101", max_tokens=40_000, temperature=None, thinking={"type": "enabled", "budget_tokens": 10_000}, - ), - summarization_llm=summarization_llm or create_anthropic_llm(), - filtering_llm=filtering_llm or create_anthropic_llm(), - **kwargs, - ) + ) - def count_tokens(self, text): - return len(self._tokenizer.encode(text)) + if "patch_summarizer" not in kwargs: + from bugbug.tools.patch_summarization.agent import PatchSummarizationTool - def generate_initial_prompt(self, patch: Patch) -> str: - formatted_patch = format_patch_set(patch.patch_set) + kwargs["patch_summarizer"] = PatchSummarizationTool.create() - output_summarization = self.summarization_chain.invoke( - { - "patch": formatted_patch, - "bug_title": patch.bug_title, - "patch_title": patch.patch_title, - "patch_description": patch.patch_description, - }, - return_only_outputs=True, - )["text"] + if "suggestion_filterer" not in kwargs: + from bugbug.tools.suggestion_filtering.agent import SuggestionFilteringTool - if self.verbose: - GenerativeModelTool._print_answer(output_summarization) + kwargs["suggestion_filterer"] = SuggestionFilteringTool.create() + return CodeReviewTool(**kwargs) + + def count_tokens(self, text): + return len(self._tokenizer.encode(text)) + + def generate_initial_prompt(self, patch: Patch, patch_summary: str) -> str: created_before = patch.date_created if self.is_experiment_env else None + return FIRST_MESSAGE_TEMPLATE.format( - patch=formatted_patch, - patch_summarization=output_summarization, + patch=format_patch_set(patch.patch_set), + patch_summarization=patch_summary, comment_examples=self._get_comment_examples(patch, created_before), approved_examples=self._get_generated_examples(patch, created_before), ) - def _generate_suggestions(self, patch: Patch) -> list[GeneratedReviewComment]: + def generate_review_comments( + self, patch: Patch, patch_summary: str + ) -> list[GeneratedReviewComment]: try: for chunk in self.agent.stream( { "messages": [ - HumanMessage(self.generate_initial_prompt(patch)), + HumanMessage( + self.generate_initial_prompt(patch, patch_summary) + ), ] }, context=CodeReviewContext(patch=patch), @@ -212,35 +218,26 @@ def _generate_suggestions(self, patch: Patch) -> list[GeneratedReviewComment]: return result["structured_response"].comments + def run_by_diff_id(self, diff_id: str) -> list[InlineComment] | None: + patch = self.review_data.get_patch_by_id(diff_id) + return self.run(patch) + def run(self, patch: Patch) -> list[InlineComment] | None: if self.count_tokens(patch.raw_diff) > 21000: raise LargeDiffError("The diff is too large") - unfiltered_suggestions = self._generate_suggestions(patch) + patch_summary = self.patch_summarizer.run(patch) + + unfiltered_suggestions = self.generate_review_comments(patch, patch_summary) if not unfiltered_suggestions: 
logger.info("No suggestions were generated") return [] - rejected_examples = ( - "\n - ".join(self.get_similar_rejected_comments(unfiltered_suggestions)) - if self.suggestions_feedback_db - else DEFAULT_REJECTED_EXAMPLES - ) - - raw_output = self.filtering_chain.invoke( - { - "comments": str( - [comment.model_dump() for comment in unfiltered_suggestions] - ), - "rejected_examples": rejected_examples, - }, - return_only_outputs=True, - )["text"] - - if self.verbose: - GenerativeModelTool._print_answer(raw_output) + filtered_suggestions = self.suggestion_filterer.run(unfiltered_suggestions) - return list(generate_processed_output(raw_output, patch.patch_set)) + return list( + convert_generated_comments_to_inline(filtered_suggestions, patch.patch_set) + ) def _get_generated_examples(self, patch, created_before: datetime | None = None): """Get examples of comments that were generated by an LLM. @@ -320,24 +317,3 @@ def generate_formatted_patch_from_raw_hunk(raw_hunk, filename): ) for num, example in enumerate(comment_examples) ) - - def get_similar_rejected_comments( - self, suggestions: list[GeneratedReviewComment] - ) -> Iterable[str]: - if not self.suggestions_feedback_db: - raise Exception("Suggestions feedback database is not available") - - num_examples_per_suggestion = 10 // len(suggestions) or 1 - seen_ids: set[int] = set() - - for suggestion in suggestions: - similar_rejected_suggestions = ( - self.suggestions_feedback_db.find_similar_rejected_suggestions( - suggestion.comment, - limit=num_examples_per_suggestion, - excluded_ids=seen_ids, - ) - ) - for rejected_suggestion in similar_rejected_suggestions: - seen_ids.add(rejected_suggestion.id) - yield rejected_suggestion.comment diff --git a/bugbug/tools/code_review/prompts.py b/bugbug/tools/code_review/prompts.py index 9b253c026b..7bffd5c442 100644 --- a/bugbug/tools/code_review/prompts.py +++ b/bugbug/tools/code_review/prompts.py @@ -7,31 +7,6 @@ TARGET_SOFTWARE: str | None = None -PROMPT_TEMPLATE_SUMMARIZATION = """You are an expert reviewer for {experience_scope}, with experience on source code reviews. - -Please, analyze the code provided and report a summarization about the new changes; for that, focus on the coded added represented by lines that start with "+". - -The summarization should have two parts: - 1. **Intent**: Describe the intent of the changes, what they are trying to achieve, and how they relate to the bug or feature request. - 2. **Solution**: Describe the solution implemented in the code changes, focusing on how the changes address the intent. - -Do not include any code in the summarization, only a description of the changes. - -**Bug title**: - -{bug_title} - - -**Commit message**: - -{patch_title} -{patch_description} - - -**Diff**: - -{patch} -""" SYSTEM_PROMPT_TEMPLATE = """You are an expert {target_software} engineer tasked with analyzing a pull request and providing high-quality review comments. You will examine a code patch and generate constructive feedback focusing on potential issues in the changed code. @@ -113,36 +88,6 @@ {comments}""" -PROMPT_TEMPLATE_FILTERING_ANALYSIS = """Filter review comments to keep those that: -- are consistent with the {target_code_consistency} source code; -- focus on reporting possible bugs, functional regressions, issues, or similar concerns; -- report readability or design concerns. 
- -Exclude comments that: -- only describe the change; -- restate obvious facts like renamed variables or replaced code; -- include praising; -- ask if changes are intentional or ask to ensure things exist. - -Only return a valid JSON list. Do not drop any key from the JSON objects. - -Comments: -{comments} - -As examples of not expected comments, not related to the current patch, please, check some below: - - {rejected_examples} -""" - - -DEFAULT_REJECTED_EXAMPLES = """Please note that these are minor improvements and the overall quality of the patch is good. The documentation is being expanded in a clear and structured way, which will likely be beneficial for future development. - - Please note that these are just suggestions and the code might work perfectly fine as it is. It's always a good idea to test all changes thoroughly to ensure they work as expected. - - Overall, the patch seems to be well implemented with no major concerns. The developers have made a conscious decision to align with Chrome's behavior, and the reasoning is well documented. - - There are no complex code changes in this patch, so there's no potential for major readability regressions or bugs introduced by the changes. - - The `focus(...)` method is called without checking if the element and its associated parameters exist or not. It would be better to check if the element exists before calling the `focus()` method to avoid potential errors. - - It's not clear if the `SearchService.sys.mjs` file exists or not. If it doesn't exist, this could cause an error. Please ensure that the file path is correct. - - This is a good addition to the code.""" - - STATIC_COMMENT_EXAMPLES = [ { "comment": { diff --git a/bugbug/tools/code_review/scorer.py b/bugbug/tools/code_review/scorer.py new file mode 100644 index 0000000000..fac5aab475 --- /dev/null +++ b/bugbug/tools/code_review/scorer.py @@ -0,0 +1,285 @@ +from functools import cached_property +from logging import getLogger + +import weave + +from bugbug.tools.comment_matching.agent import CommentMatchingTool +from bugbug.tools.suggestion_filtering.agent import SuggestionFilteringTool + +logger = getLogger(__name__) + + +class BasicMetricsScorer(weave.Scorer): + """Score basic metrics: comment counts and error tracking.""" + + @weave.op() + def score( + self, + output: dict, + ground_truth_comments: list[dict], + ) -> dict: + valid_comment_count = sum( + comment["evaluation"] == "VALID" for comment in ground_truth_comments + ) + invalid_comment_count = sum( + comment["evaluation"] == "INVALID" for comment in ground_truth_comments + ) + + return { + "generated_comment_count": len(output["comments"]), + "ground_truth_valid_count": valid_comment_count, + "ground_truth_invalid_count": invalid_comment_count, + "ground_truth_total_count": len(ground_truth_comments), + "successful": not output["error"], + } + + def summarize(self, score_rows: list[dict]) -> dict: + """Aggregate scores across all examples.""" + total_generated = sum(r["generated_comment_count"] for r in score_rows) + total_gt_valid = sum(r["ground_truth_valid_count"] for r in score_rows) + total_gt_invalid = sum(r["ground_truth_invalid_count"] for r in score_rows) + total_gt = sum(r["ground_truth_total_count"] for r in score_rows) + error_count = sum(not r["successful"] for r in score_rows) + + return { + "total_generated_comments": total_generated, + "total_ground_truth_valid": total_gt_valid, + "total_ground_truth_invalid": total_gt_invalid, + "total_ground_truth": total_gt, + "avg_generated_per_diff": ( + 
total_generated / len(score_rows) if score_rows else 0 + ), + "error_rate": error_count / len(score_rows) if score_rows else 0, + "num_examples": len(score_rows), + } + + +class LLMCommentMatchingScorer(weave.Scorer): + """Score comment matching using LLM-based semantic comparison. + + This scorer uses an LLM to match generated comments against ground truth + comments, calculating recall and precision metrics. + """ + + @cached_property + def matching_tool(self): + return CommentMatchingTool.create() + + @cached_property + def filtering_tool(self): + return SuggestionFilteringTool.create() + + @weave.op() + def score( + self, + output: dict, + ground_truth_comments: list[dict], + diff_id: int, + ) -> dict: + generated_comments = output["comments"] + + retained_indices = set( + self.filtering_tool.get_indices_of_retained_comments(generated_comments) + ) + retained_comments = [ + c for i, c in enumerate(generated_comments) if i in retained_indices + ] + excluded_comments = [ + c for i, c in enumerate(generated_comments) if i not in retained_indices + ] + + old_comments = [ + {"id": i, "content": c["comment"], "file": c["file_path"]} + for i, c in enumerate(ground_truth_comments) + ] + + new_comments = [ + {"id": i, "content": c.comment, "file": c.file} + for i, c in enumerate(generated_comments) + ] + + matches = self.matching_tool.run( + old_comments=old_comments, new_comments=new_comments + ) + + seen_old: set[int] = set() + seen_new: set[int] = set() + matched_valid_retained = [] + matched_valid_excluded = [] + matched_invalid_retained = [] + matched_invalid_excluded = [] + + for match in matches: + old_idx = match.old_comment_id + new_idx = match.new_comment_id + + if old_idx >= len(ground_truth_comments) or new_idx >= len( + generated_comments + ): + continue + + # Validate file match + gt_comment = ground_truth_comments[old_idx] + gen_comment = generated_comments[new_idx] + + if gt_comment["file_path"] != gen_comment.file: + logger.debug( + f"File mismatch for diff {diff_id}: " + f"{gt_comment['file_path']} != {gen_comment.file}" + ) + continue + + seen_old.add(old_idx) + seen_new.add(new_idx) + + is_retained = new_idx in retained_indices + match_comments = { + "ground_truth_comment": gt_comment, + "generated_comment": gen_comment, + } + + if gt_comment["evaluation"] == "VALID": + if is_retained: + matched_valid_retained.append(match_comments) + else: + matched_valid_excluded.append(match_comments) + else: + if is_retained: + matched_invalid_retained.append(match_comments) + else: + matched_invalid_excluded.append(match_comments) + + unmatched_gt_valid = [] + unmatched_gt_invalid = [] + + for i in range(len(ground_truth_comments)): + if i in seen_old: + continue + + comment = ground_truth_comments[i] + evaluation = ground_truth_comments[i]["evaluation"] + if evaluation == "VALID": + unmatched_gt_valid.append(comment) + else: + unmatched_gt_invalid.append(comment) + + unmatched_gen_retained = [] + unmatched_gen_excluded = [] + + for i in range(len(generated_comments)): + if i in seen_new: + continue + + comment = new_comments[i] + if i in retained_indices: + unmatched_gen_retained.append(comment) + else: + unmatched_gen_excluded.append(comment) + + return { + # Matched counts (derived from lists) + "matched_valid_count": len(matched_valid_retained) + + len(matched_valid_excluded), + "matched_invalid_count": len(matched_invalid_retained) + + len(matched_invalid_excluded), + # Unmatched counts + "unmatched_generated_count": len(unmatched_gen_retained) + + len(unmatched_gen_excluded), + 
"unmatched_ground_truth_valid_count": len(unmatched_gt_valid), + "unmatched_ground_truth_invalid_count": len(unmatched_gt_invalid), + # Unmatched details + "unmatched_ground_truth_valid": unmatched_gt_valid, + "unmatched_ground_truth_invalid": unmatched_gt_invalid, + "unmatched_gen_retained": unmatched_gen_retained, + "unmatched_gen_excluded": unmatched_gen_excluded, + # Filtering metrics + "filtering_retained_count": len(retained_comments), + "filtering_excluded_count": len(excluded_comments), + "filtering_retention_rate": ( + len(retained_comments) / len(generated_comments) + if generated_comments + else 0 + ), + "filtering_retained_comments": retained_comments, + "filtering_excluded_comments": excluded_comments, + # Filtering x Matching breakdown (lists with details) + "matched_valid_retained": matched_valid_retained, + "matched_valid_excluded": matched_valid_excluded, + "matched_invalid_retained": matched_invalid_retained, + "matched_invalid_excluded": matched_invalid_excluded, + } + + def summarize(self, score_rows: list[dict]) -> dict: + total_matched_valid = sum(r["matched_valid_count"] for r in score_rows) + total_matched_invalid = sum(r["matched_invalid_count"] for r in score_rows) + total_unmatched_gen = sum(r["unmatched_generated_count"] for r in score_rows) + total_unmatched_gt_valid = sum( + r["unmatched_ground_truth_valid_count"] for r in score_rows + ) + total_unmatched_gt_invalid = sum( + r["unmatched_ground_truth_invalid_count"] for r in score_rows + ) + + total_gt_valid = total_matched_valid + total_unmatched_gt_valid + total_gt_invalid = total_matched_invalid + total_unmatched_gt_invalid + + # Filtering aggregates + total_retained = sum(r["filtering_retained_count"] for r in score_rows) + total_excluded = sum(r["filtering_excluded_count"] for r in score_rows) + total_generated = total_retained + total_excluded + + # Filtering x Matching aggregates (use len() since values are lists) + total_matched_valid_retained = sum( + len(r["matched_valid_retained"]) for r in score_rows + ) + total_matched_valid_excluded = sum( + len(r["matched_valid_excluded"]) for r in score_rows + ) + total_matched_invalid_retained = sum( + len(r["matched_invalid_retained"]) for r in score_rows + ) + total_matched_invalid_excluded = sum( + len(r["matched_invalid_excluded"]) for r in score_rows + ) + + return { + "total_matched_valid": total_matched_valid, + "total_matched_invalid": total_matched_invalid, + "total_unmatched_generated": total_unmatched_gen, + "recall_valid": ( + total_matched_valid / total_gt_valid if total_gt_valid > 0 else 0 + ), + "recall_invalid": ( + total_matched_invalid / total_gt_invalid if total_gt_invalid > 0 else 0 + ), + "missed_valid_rate": ( + total_unmatched_gt_valid / total_gt_valid if total_gt_valid > 0 else 0 + ), + "missed_invalid_rate": ( + total_unmatched_gt_invalid / total_gt_invalid + if total_gt_invalid > 0 + else 0 + ), + # Filtering summary metrics + "total_filtering_retained": total_retained, + "total_filtering_excluded": total_excluded, + "overall_retention_rate": ( + total_retained / total_generated if total_generated > 0 else 0 + ), + # Filtering x Matching summary + "total_matched_valid_retained": total_matched_valid_retained, + "total_matched_valid_excluded": total_matched_valid_excluded, + "total_matched_invalid_retained": total_matched_invalid_retained, + "total_matched_invalid_excluded": total_matched_invalid_excluded, + # Derived rates + "false_exclusion_rate": ( + total_matched_valid_excluded / total_matched_valid + if total_matched_valid > 0 + 
else 0 + ), + "true_exclusion_rate": ( + total_matched_invalid_excluded / total_matched_invalid + if total_matched_invalid > 0 + else 0 + ), + } diff --git a/bugbug/tools/code_review/utils.py b/bugbug/tools/code_review/utils.py index ce5c5e85dc..9bc648e440 100644 --- a/bugbug/tools/code_review/utils.py +++ b/bugbug/tools/code_review/utils.py @@ -4,7 +4,6 @@ # You can obtain one at http://mozilla.org/MPL/2.0/. -import json import re from itertools import chain from logging import getLogger @@ -343,37 +342,24 @@ def get_structured_functions(target, functions_declaration): return function_declaration_text -def parse_model_output(output: str) -> list[dict]: - # TODO: This function should raise an exception when the output is not valid - # JSON. The caller can then decide how to handle the error. - # For now, we just log the error and return an empty list. +def convert_generated_comments_to_inline( + comments, patch: PatchSet +) -> Iterable[InlineComment]: + """Convert GeneratedReviewComment objects to InlineComment objects. - json_opening_tag = output.find("```json") - if json_opening_tag != -1: - json_closing_tag = output.rfind("```") - if json_closing_tag == -1: - logger.error("No closing ``` found for JSON in model output: %s", output) - return [] - output = output[json_opening_tag + len("```json") : json_closing_tag] - - try: - comments = json.loads(output) - except json.JSONDecodeError: - logger.error("Failed to parse JSON from model output: %s", output) - return [] - - return comments - - -def generate_processed_output(output: str, patch: PatchSet) -> Iterable[InlineComment]: - comments = parse_model_output(output) + Args: + comments: List of GeneratedReviewComment objects. + patch: The PatchSet to validate file paths against. + Yields: + InlineComment objects with proper scope information. 
+ """ patched_files_map = { patched_file.target_file: patched_file for patched_file in patch } for comment in comments: - file_path = comment["file"] + file_path = comment.file if not file_path.startswith("b/") and not file_path.startswith("a/"): file_path = "b/" + file_path @@ -385,7 +371,7 @@ def generate_processed_output(output: str, patch: PatchSet) -> Iterable[InlineCo f"The file `{file_path}` is not part of the patch: {list(patched_files_map)}" ) - line_number = comment["code_line"] + line_number = comment.code_line if not isinstance(line_number, int): raise ModelResultError("Line number must be an integer") @@ -401,8 +387,8 @@ def generate_processed_output(output: str, patch: PatchSet) -> Iterable[InlineCo end_line=line_number, hunk_start_line=scope["line_start"], hunk_end_line=scope["line_end"], - content=comment["comment"], + content=comment.comment, on_removed_code=not scope["has_added_lines"], - explanation=comment["explanation"], - order=comment["order"], + explanation=comment.explanation, + order=comment.order, ) diff --git a/bugbug/tools/comment_matching/agent.py b/bugbug/tools/comment_matching/agent.py new file mode 100644 index 0000000000..2da194a5b4 --- /dev/null +++ b/bugbug/tools/comment_matching/agent.py @@ -0,0 +1,62 @@ +from langchain.agents import create_agent +from langchain.agents.structured_output import ProviderStrategy +from langchain.chat_models import BaseChatModel +from langchain.messages import HumanMessage +from pydantic import BaseModel, Field + +from bugbug.tools.comment_matching.prompts import FIRST_MESSAGE_TEMPLATE, SYSTEM_PROMPT + + +class MatchingComment(BaseModel): + id: int = Field(description="Unique identifier for the comment") + content: str = Field(description="Content of the code review comment") + file: str = Field(description="File path of the comment") + + +class CommentMatch(BaseModel): + old_comment_id: int = Field(description="ID of the comment from the old set") + new_comment_id: int = Field(description="ID of the comment from the new set") + + +class AgentResponse(BaseModel): + matches: list[CommentMatch] = Field( + description="List of matched comment pairs between old and new comments" + ) + + +class CommentMatchingTool: + def __init__(self, llm: BaseChatModel): + self.agent = create_agent( + llm, + system_prompt=SYSTEM_PROMPT, + response_format=ProviderStrategy(AgentResponse), + ) + + def create(**kwargs): + if "llm" not in kwargs: + from bugbug.tools.core.llms import create_openai_llm + + kwargs["llm"] = create_openai_llm() + + return CommentMatchingTool(**kwargs) + + def run( + self, old_comments: list[MatchingComment], new_comments: list[MatchingComment] + ) -> list[CommentMatch]: + if not old_comments or not new_comments: + return [] + + response = self.agent.invoke( + { + "messages": [ + HumanMessage( + FIRST_MESSAGE_TEMPLATE.format( + old_comments=old_comments, + new_comments=new_comments, + ) + ) + ] + } + ) + + return response["structured_response"].matches diff --git a/bugbug/tools/comment_matching/prompts.py b/bugbug/tools/comment_matching/prompts.py new file mode 100644 index 0000000000..8be398afc1 --- /dev/null +++ b/bugbug/tools/comment_matching/prompts.py @@ -0,0 +1,29 @@ +SYSTEM_PROMPT = """You are an expert in code review at Mozilla Firefox. + +**Task**: + +Match two sets of code review comments to identify redundant comments. + +**Instructions**: + + 1. **Consider the following about all comments**: + - The comments are related to the same code patch. + - The comments may be written in different styles. + + 2. 
**Understand what each comment is addressing**: + - Read the comments in both sets. + - Understand the issue that each comment is addressing. + + 3. **Check for matches**: + - If you find a comment in the old set that is addressing the same issue as a comment in the new set, link them as redundant. + - The comments may not be identical, but they should be addressing the same issue. + - The level of detail in the comments may vary. +""" + +FIRST_MESSAGE_TEMPLATE = """**First set of comments (old comments / ground truth)**: + +{old_comments} + +**Second set of comments (new comments / generated)**: + +{new_comments}""" diff --git a/bugbug/tools/core/platforms/base.py b/bugbug/tools/core/platforms/base.py index c63102d780..0805e3b25a 100644 --- a/bugbug/tools/core/platforms/base.py +++ b/bugbug/tools/core/platforms/base.py @@ -79,7 +79,7 @@ def get_review_request_by_id(self, review_id: int): raise NotImplementedError @abstractmethod - def get_patch_by_id(self, patch_id: str) -> Patch: + def get_patch_by_id(self, patch_id: str | int) -> Patch: raise NotImplementedError @abstractmethod diff --git a/bugbug/tools/core/platforms/phabricator.py b/bugbug/tools/core/platforms/phabricator.py index 8f0defdd57..4122a60060 100644 --- a/bugbug/tools/core/platforms/phabricator.py +++ b/bugbug/tools/core/platforms/phabricator.py @@ -497,7 +497,7 @@ def get_review_request_by_id(self, revision_id: int) -> ReviewRequest: wait=tenacity.wait_exponential(multiplier=2, min=2), reraise=True, ) - def get_patch_by_id(self, patch_id: str) -> Patch: + def get_patch_by_id(self, patch_id: str | int) -> Patch: return PhabricatorPatch(patch_id) def get_all_inline_comments( diff --git a/bugbug/tools/core/platforms/swarm.py b/bugbug/tools/core/platforms/swarm.py index 0fbf234d96..13cca92fda 100644 --- a/bugbug/tools/core/platforms/swarm.py +++ b/bugbug/tools/core/platforms/swarm.py @@ -81,8 +81,8 @@ def __init__(self): def get_review_request_by_id(self, revision_id: int) -> ReviewRequest: return ReviewRequest(revision_id) - def get_patch_by_id(self, patch_id: str) -> Patch: - return SwarmPatch(patch_id, self.auth) + def get_patch_by_id(self, patch_id: str | int) -> Patch: + return SwarmPatch(str(patch_id), self.auth) def get_all_inline_comments( self, comment_filter diff --git a/bugbug/tools/patch_summarization/agent.py b/bugbug/tools/patch_summarization/agent.py new file mode 100644 index 0000000000..9e09935f25 --- /dev/null +++ b/bugbug/tools/patch_summarization/agent.py @@ -0,0 +1,48 @@ +from langchain.agents import create_agent +from langchain.chat_models import BaseChatModel +from langchain.messages import HumanMessage + +from bugbug.tools.base import GenerativeModelTool +from bugbug.tools.code_review.utils import format_patch_set +from bugbug.tools.core.platforms.base import Patch +from bugbug.tools.patch_summarization.prompts import PROMPT_TEMPLATE_SUMMARIZATION + + +class PatchSummarizationTool(GenerativeModelTool): + def __init__(self, llm: BaseChatModel, target_software: str = "Mozilla Firefox"): + self.target_software = target_software + self.agent = create_agent(llm) + + def run(self, patch: Patch): + result = self.agent.invoke( + { + "messages": [ + HumanMessage( + PROMPT_TEMPLATE_SUMMARIZATION.format( + target_software=self.target_software, + patch=format_patch_set(patch.patch_set), + bug_title=patch.bug_title, + patch_title=patch.patch_title, + patch_description=patch.patch_description, + ) + ) + ] + } + ) + + return result["messages"][-1].content + + @staticmethod + def create(**kwargs): + """Factory method 
to instantiate the tool with default dependencies. + + This method takes the same parameters as the constructor, but all + parameters are optional. If a parameter is not provided, a default + component will be created and used. + """ + if "llm" not in kwargs: + from bugbug.tools.core.llms import create_anthropic_llm + + kwargs["llm"] = create_anthropic_llm() + + return PatchSummarizationTool(**kwargs) diff --git a/bugbug/tools/patch_summarization/prompts.py b/bugbug/tools/patch_summarization/prompts.py new file mode 100644 index 0000000000..fa0e167c18 --- /dev/null +++ b/bugbug/tools/patch_summarization/prompts.py @@ -0,0 +1,25 @@ +PROMPT_TEMPLATE_SUMMARIZATION = """You are an expert reviewer for {target_software}, with experience on source code reviews. + +Please, analyze the code provided and report a summarization about the new changes; for that, focus on the coded added represented by lines that start with "+". + +The summarization should have two parts: + 1. **Intent**: Describe the intent of the changes, what they are trying to achieve, and how they relate to the bug or feature request. + 2. **Solution**: Describe the solution implemented in the code changes, focusing on how the changes address the intent. + +Do not include any code in the summarization, only a description of the changes. + +**Bug title**: + +{bug_title} + + +**Commit message**: + +{patch_title} +{patch_description} + + +**Diff**: + +{patch} +""" diff --git a/bugbug/tools/suggestion_filtering/agent.py b/bugbug/tools/suggestion_filtering/agent.py new file mode 100644 index 0000000000..55a0623e0c --- /dev/null +++ b/bugbug/tools/suggestion_filtering/agent.py @@ -0,0 +1,156 @@ +# -*- coding: utf-8 -*- +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this file, +# You can obtain one at http://mozilla.org/MPL/2.0/. + +"""Suggestion filtering tool implementation.""" + +from typing import Iterable, Optional + +from langchain.agents import create_agent +from langchain.agents.structured_output import ProviderStrategy +from langchain.chat_models import BaseChatModel +from langchain.messages import HumanMessage +from pydantic import BaseModel, Field + +from bugbug.tools.base import GenerativeModelTool +from bugbug.tools.code_review.agent import GeneratedReviewComment +from bugbug.tools.code_review.database import SuggestionsFeedbackDB +from bugbug.tools.suggestion_filtering.prompts import ( + DEFAULT_REJECTED_EXAMPLES, + PROMPT_TEMPLATE_FILTERING_ANALYSIS, +) + + +class FilteredComments(BaseModel): + """The response from the filtering agent.""" + + comment_indices: list[int] = Field( + description="A list of indices of the comments that were kept after filtering" + ) + + +class SuggestionFilteringTool(GenerativeModelTool): + """Tool for filtering generated review comments. + + Filters out low-quality suggestions using an LLM agent with + optional dynamic rejected examples from a feedback database. + """ + + def __init__( + self, + llm: BaseChatModel, + target_software: str = "Mozilla Firefox", + suggestions_feedback_db: Optional[SuggestionsFeedbackDB] = None, + ) -> None: + self.target_software = target_software + self.suggestions_feedback_db = suggestions_feedback_db + self.agent = create_agent( + llm, + response_format=ProviderStrategy(FilteredComments), + ) + + def get_indices_of_retained_comments( + self, suggestions: list[GeneratedReviewComment] + ) -> list[int]: + """Get indices of comments to keep after filtering. 
+ + Args: + suggestions: List of generated review comments to filter. + + Returns: + List of indices of comments to keep. + """ + if not suggestions: + return [] + + rejected_examples = self._get_rejected_examples(suggestions) + + formatted_comments = "\n".join( + f"Index {i}: {comment.model_dump()}" + for i, comment in enumerate(suggestions) + ) + + prompt = PROMPT_TEMPLATE_FILTERING_ANALYSIS.format( + target_code_consistency=self.target_software, + comments=formatted_comments, + rejected_examples=rejected_examples, + ) + + result = self.agent.invoke({"messages": [HumanMessage(prompt)]}) + + return result["structured_response"].comment_indices + + def run( + self, suggestions: list[GeneratedReviewComment] + ) -> list[GeneratedReviewComment]: + """Filter the given suggestions and return filtered comments. + + Args: + suggestions: List of generated review comments to filter. + + Returns: + List of filtered GeneratedReviewComment objects. + """ + return [ + suggestions[i] for i in self.get_indices_of_retained_comments(suggestions) + ] + + def _get_rejected_examples(self, suggestions: list[GeneratedReviewComment]) -> str: + """Get rejected examples for filtering. + + Uses dynamic examples from feedback database if available, + otherwise falls back to default static examples. + """ + if not self.suggestions_feedback_db: + return DEFAULT_REJECTED_EXAMPLES + + rejected_comments = list(self._get_similar_rejected_comments(suggestions)) + if not rejected_comments: + return DEFAULT_REJECTED_EXAMPLES + + return "\n - ".join(rejected_comments) + + def _get_similar_rejected_comments( + self, suggestions: list[GeneratedReviewComment] + ) -> Iterable[str]: + """Find similar rejected comments from the feedback database.""" + if not self.suggestions_feedback_db: + raise Exception("Suggestions feedback database is not available") + + num_examples_per_suggestion = 10 // len(suggestions) or 1 + seen_ids: set[int] = set() + + for suggestion in suggestions: + similar_rejected_suggestions = ( + self.suggestions_feedback_db.find_similar_rejected_suggestions( + suggestion.comment, + limit=num_examples_per_suggestion, + excluded_ids=seen_ids, + ) + ) + for rejected_suggestion in similar_rejected_suggestions: + seen_ids.add(rejected_suggestion.id) + yield rejected_suggestion.comment + + @staticmethod + def create(**kwargs): + """Factory method to instantiate the tool with default dependencies. + + This method takes the same parameters as the constructor, but all + parameters are optional. If a parameter is not provided, a default + component will be created and used. + """ + if "suggestions_feedback_db" not in kwargs: + from bugbug.vectordb import QdrantVectorDB + + kwargs["suggestions_feedback_db"] = SuggestionsFeedbackDB( + QdrantVectorDB("suggestions_feedback") + ) + + if "llm" not in kwargs: + from bugbug.tools.core.llms import create_anthropic_llm + + kwargs["llm"] = create_anthropic_llm() + + return SuggestionFilteringTool(**kwargs) diff --git a/bugbug/tools/suggestion_filtering/prompts.py b/bugbug/tools/suggestion_filtering/prompts.py new file mode 100644 index 0000000000..8478e3ac17 --- /dev/null +++ b/bugbug/tools/suggestion_filtering/prompts.py @@ -0,0 +1,37 @@ +# -*- coding: utf-8 -*- +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this file, +# You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +"""Prompt templates for suggestion filtering.""" + +PROMPT_TEMPLATE_FILTERING_ANALYSIS = """Filter review comments to keep those that: +- are consistent with the {target_code_consistency} source code; +- focus on reporting possible bugs, functional regressions, issues, or similar concerns; +- report readability or design concerns. + +Exclude comments that: +- only describe the change; +- restate obvious facts like renamed variables or replaced code; +- include praising; +- ask if changes are intentional or ask to ensure things exist. + +Comments: + +{comments} + + +As examples of not expected comments, not related to the current patch, please, check some below: + + - {rejected_examples} + +""" + + +DEFAULT_REJECTED_EXAMPLES = """Please note that these are minor improvements and the overall quality of the patch is good. The documentation is being expanded in a clear and structured way, which will likely be beneficial for future development. + - Please note that these are just suggestions and the code might work perfectly fine as it is. It's always a good idea to test all changes thoroughly to ensure they work as expected. + - Overall, the patch seems to be well implemented with no major concerns. The developers have made a conscious decision to align with Chrome's behavior, and the reasoning is well documented. + - There are no complex code changes in this patch, so there's no potential for major readability regressions or bugs introduced by the changes. + - The `focus(...)` method is called without checking if the element and its associated parameters exist or not. It would be better to check if the element exists before calling the `focus()` method to avoid potential errors. + - It's not clear if the `SearchService.sys.mjs` file exists or not. If it doesn't exist, this could cause an error. Please ensure that the file path is correct. 
+ - This is a good addition to the code.""" diff --git a/experiments/review_helper_modify_filtering_step.ipy b/experiments/review_helper_modify_filtering_step.ipy deleted file mode 100644 index 26d764a93f..0000000000 --- a/experiments/review_helper_modify_filtering_step.ipy +++ /dev/null @@ -1,306 +0,0 @@ -# %% -%pip install -q bugbug==0.0.570 - -# %% -from bugbug import utils -from bugbug.code_search.mozilla import FunctionSearchMozilla -from bugbug.tools.code_review.utils import generate_processed_output, parse_model_output - - -def get_file(commit_hash, path): - r = utils.get_session("hgmo").get( - f"https://hg.mozilla.org/mozilla-unified/raw-file/{commit_hash}/{path}", - headers={ - "User-Agent": utils.get_user_agent(), - }, - ) - r.raise_for_status() - return r.text - - -repo_dir = "../mozilla-unified" -function_search = FunctionSearchMozilla(repo_dir, get_file, True) - - -# %% -from dotenv import load_dotenv - -from bugbug.generative_model_tool import create_openai_llm - -load_dotenv("~/.env") - -llm = create_openai_llm() - -# %% -from bugbug.tools import code_review -from bugbug.vectordb import QdrantVectorDB - -review_comments_db = code_review.ReviewCommentsDB(QdrantVectorDB("diff_comments")) - - -suggestions_feedback_db = code_review.SuggestionsFeedbackDB( - QdrantVectorDB("suggestions_feedback") -) - -review_platform = "phabricator" -review_data: code_review.ReviewData = code_review.review_data_classes[review_platform]() - - -# %% -tool = code_review.CodeReviewTool( - llm=llm, - # function_search=function_search, - review_comments_db=review_comments_db, - suggestions_feedback_db=suggestions_feedback_db, - verbose=False, -) - - -# %% -from langchain.chains import LLMChain -from langchain.prompts import PromptTemplate - -partial_variables = { - "target_code_consistency": "Mozilla Firefox", -} - -current_filtering_chain = LLMChain( - prompt=PromptTemplate.from_template( - code_review.PROMPT_TEMPLATE_FILTERING_ANALYSIS, - partial_variables=partial_variables, - ), - llm=llm, -) - - -# %% -from scripts.code_review_tool_evaluator import ( - FeedbackEvaluator, -) - -evaluator = FeedbackEvaluator("../bugbug_evaluation_datasets/evaluated_comments.csv") - - -# %% -import pandas as pd - - -def get_latest_evaluation_results_file(): - import glob - - files = glob.glob("evaluation_results_*#*.csv") - if not files: - raise FileNotFoundError( - "No evaluation results file found. The file name should have `#` followed by a PR number." 
- ) - - return max(files) - - -# SELECTION (Option 1: same as before) -selected_review_requests = [ - (revision_id, code_review.ReviewRequest(diff_id)) - for revision_id, diff_id in pd.read_csv(get_latest_evaluation_results_file())[ - ["revision_id", "diff_id"] - ] - .drop_duplicates() - .itertuples(name=None, index=False) -] - -# SELECTION (Option 2: random sample) -# selected_review_requests = [ -# (revision_id, code_review.ReviewRequest(diff_id)) -# for revision_id, diff_id in evaluator.evaluated_comments -# # .query("evaluation == 'CORRECT'") -# [["revision_id", "diff_id"]].drop_duplicates().sample(100).itertuples(index=False) -# ] - - -# %% - - -def common_step(): - for review_request_id, review_request in selected_review_requests: - print("---------------------------------------------------------") - print(f"Review Request ID: {review_request_id}") - print(f"Patch ID: {review_request.patch_id}") - patch = review_data.get_patch_by_id(review_request.patch_id) - print("---------------------------------------------------------") - - try: - output = tool._generate_suggestions(patch) - except code_review.FileNotInPatchError as e: - print("Error while running the tool:", e) - continue - except code_review.LargeDiffError: - print("Skipping the patch because it is too large.") - continue - - unfiltered_suggestions = parse_model_output(output) - if not unfiltered_suggestions: - print("No suggestions were generated") - continue - - rejected_examples = ( - "\n - ".join(tool.get_similar_rejected_comments(unfiltered_suggestions)) - if tool.suggestions_feedback_db - else code_review.DEFAULT_REJECTED_EXAMPLES - ) - - raw_output = current_filtering_chain.invoke( - { - "review": output, - "patch": patch.raw_diff, - "rejected_examples": rejected_examples, - }, - return_only_outputs=True, - )["text"] - - try: - comments = list( - generate_processed_output(raw_output, patch.patch_set) - ) - except code_review.FileNotInPatchError as e: - print("Error while running the tool:", e) - continue - - # print_prettified_comments(comments) - evaluation = evaluator.evaluate_diff_comments(review_request.patch_id, comments) - evaluation_for_original = [ - { - "variant_name": "Current Filtering", - "revision_id": review_request_id, - "diff_id": review_request.patch_id, - **row, - } - for row in evaluation - ] - - yield ( - rejected_examples, - output, - patch, - review_request_id, - review_request, - evaluation_for_original, - ) - - -common_step_results = list(common_step()) - -# %% -import pickle - -if "common_step_results" not in locals(): - raise ValueError( - "common_step_results is not defined. Please run the common_step() function first." - ) - - -with open("common_step_results.pkl", "wb") as f: - pickle.dump(common_step_results, f) - -# %% -import pickle - -with open("common_step_results.pkl", "rb") as f: - common_step_results = pickle.load(f) - -# %% -from datetime import datetime - -NEW__PROMPT_TEMPLATE_FILTERING_ANALYSIS = """Filter review comments to keep those that: -- are consistent with the {target_code_consistency} source code; -- focus on reporting possible bugs, functional regressions, issues, or similar concerns; -- report readability or design concerns. - -Exclude comments that: -- only describe the change; -- restate obvious facts like renamed variables or replaced code; -- include praising; -- ask if changes are intentional or ask to ensure things exist. - -Do not report any explanation about your choice. Only return a valid JSON list. 
- -Comments: -{comments} - -As examples of not expected comments, not related to the current patch, please, check some below: - - {rejected_examples} -""" - -new_filtering_chain = LLMChain( - prompt=PromptTemplate.from_template( - NEW__PROMPT_TEMPLATE_FILTERING_ANALYSIS, - partial_variables=partial_variables, - ), - llm=llm, -) - - -all_variants_evaluation_results = [] - -for ( - rejected_examples, - output, - patch, - review_request_id, - review_request, - evaluation_for_original, -) in common_step_results: - - all_variants_evaluation_results.extend(evaluation_for_original) - - raw_output = new_filtering_chain.invoke( - { - "comments": output, - "rejected_examples": rejected_examples, - }, - return_only_outputs=True, - )["text"] - - comments = list(generate_processed_output(raw_output, patch.patch_set)) - - # print_prettified_comments(comments) - evaluation = evaluator.evaluate_diff_comments(review_request.patch_id, comments) - all_variants_evaluation_results.extend( - { - "variant_name": "New Filtering", - "revision_id": review_request_id, - "diff_id": review_request.patch_id, - **row, - } - for row in evaluation - ) - - evaluation_result_all_columns = [ - "variant_name", - "revision_id", - "diff_id", - "new_comment", - "old_comments_count", - "matched", - "old_comment", - "evaluation", - ] - - -evaluation_results_file = ( - f"evaluation_results_{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.csv" -) - - -df = pd.DataFrame( - all_variants_evaluation_results, columns=evaluation_result_all_columns -) -df.to_csv( - evaluation_results_file, - index=False, - header=True, - mode="w", -) - - -# %% -print("Done!") -# %% diff --git a/notebooks/code_review_create_dataset.ipynb b/notebooks/code_review_create_dataset.ipynb new file mode 100644 index 0000000000..96cdf91e42 --- /dev/null +++ b/notebooks/code_review_create_dataset.ipynb @@ -0,0 +1,193 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "af4f2030", + "metadata": {}, + "source": [ + "# Create Dataset for Code Review Evaluation" + ] + }, + { + "cell_type": "markdown", + "id": "f0f37fd7", + "metadata": {}, + "source": [ + "## Setup" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5f57608e", + "metadata": {}, + "outputs": [], + "source": [ + "import weave\n", + "\n", + "PROJECT_NAME = \"bugbug-code-review-eval\"\n", + "DATASET_NAME = \"code_review_eval\"\n", + "\n", + "_ = weave.init(PROJECT_NAME)" + ] + }, + { + "cell_type": "markdown", + "id": "255ccaac", + "metadata": {}, + "source": [ + "## Prepare the Data\n", + "\n", + "### Load the feedback data:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5bd97ef4", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "df = pd.read_csv(\n", + " \"https://docs.google.com/spreadsheets/d/e/2PACX-1vTQLbFpQmzEv4zxaKJprosIt6RWs4jDAN6IDT6xB7EcelD0ZR8qilt2tpscUSGEwMNWVpHKDlPacA7b/pub?output=csv\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "e6f4f74a", + "metadata": {}, + "source": [ + "### Reuse summaries from the last dataset version:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "eb3fee95", + "metadata": {}, + "outputs": [], + "source": [ + "summaries = {\n", + " row[\"diff_id\"]: row[\"patch_summary\"]\n", + " for row in weave.ref(DATASET_NAME).get()\n", + " if row.get(\"patch_summary\")\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "6e93f09e", + "metadata": {}, + "source": [ + "## Generate Summaries\n", + "\n", + "Generate patch summaries for new diffs that do 
not have summaries yet." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "03accee8", + "metadata": {}, + "outputs": [], + "source": [ + "from bugbug.tools.core.platforms.phabricator import PhabricatorReviewData\n", + "from bugbug.tools.patch_summarization.agent import PatchSummarizationTool\n", + "\n", + "summarizer = PatchSummarizationTool.create()\n", + "review_data = PhabricatorReviewData()\n", + "\n", + "\n", + "@weave.op()\n", + "def summarize(diff_id):\n", + " patch = review_data.get_patch_by_id(diff_id)\n", + " summaries[diff_id] = summarizer.run(patch)\n", + "\n", + "\n", + "@weave.op()\n", + "def generate_summaries(diff_ids):\n", + " with weave.ThreadPoolExecutor() as exc:\n", + " exc.map(summarize, diff_ids)\n", + "\n", + "\n", + "diff_ids_with_no_summaries = (\n", + " df.query(\"diff_id not in @summaries\")[\"diff_id\"].unique().tolist()\n", + ")\n", + "\n", + "generate_summaries(diff_ids_with_no_summaries)" + ] + }, + { + "cell_type": "markdown", + "id": "3c4f93a4", + "metadata": {}, + "source": [ + "## Save the Dataset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c5041136", + "metadata": {}, + "outputs": [], + "source": [ + "examples = [\n", + " {\n", + " \"diff_id\": diff_id,\n", + " \"revision_id\": revision_id,\n", + " \"patch_summary\": summaries[diff_id],\n", + " \"ground_truth_comments\": ground_truth_comments,\n", + " }\n", + " for (diff_id, revision_id), ground_truth_comments in df.groupby(\n", + " [\"diff_id\", \"revision_id\"]\n", + " )\n", + " .apply(lambda x: x.to_dict(orient=\"records\"))\n", + " .to_dict()\n", + " .items()\n", + " if diff_id in summaries\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d132783a", + "metadata": {}, + "outputs": [], + "source": [ + "dataset = weave.Dataset(\n", + " name=DATASET_NAME,\n", + " description=\"Code review evaluation dataset with ground truth comments and patch summaries.\",\n", + " rows=examples,\n", + ")\n", + "\n", + "_ = weave.publish(dataset)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "bugbug (3.12.7)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/notebooks/code_review_evaluation.ipynb b/notebooks/code_review_evaluation.ipynb new file mode 100644 index 0000000000..3291d491e8 --- /dev/null +++ b/notebooks/code_review_evaluation.ipynb @@ -0,0 +1,234 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Code Review Tool Evaluation\n", + "\n", + "This notebook runs W&B Weave evaluations for the code review tool." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Setup" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import weave\n", + "\n", + "PROJECT_NAME = \"bugbug-code-review-eval\"\n", + "\n", + "_ = weave.init(PROJECT_NAME)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load Dataset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dataset = weave.ref(\"code_review_eval_legacy\").get()\n", + "\n", + "print(f\"Dataset has {len(dataset.rows)} examples\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Configure Model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from functools import cached_property\n", + "\n", + "from bugbug.tools.code_review.agent import CodeReviewTool\n", + "\n", + "\n", + "class CodeReviewModel(weave.Model):\n", + " \"\"\"Weave Model wrapper for CodeReviewTool.\"\"\"\n", + "\n", + " @cached_property\n", + " def tool(self):\n", + " return CodeReviewTool.create()\n", + "\n", + " @weave.op()\n", + " def invoke(self, diff_id: int, patch_summary: str) -> dict:\n", + " patch = self.tool.review_data.get_patch_by_id(diff_id)\n", + " try:\n", + " comments = self.tool.generate_review_comments(patch, patch_summary)\n", + " return {\n", + " \"comments\": comments,\n", + " \"error\": None,\n", + " }\n", + " except Exception as e:\n", + " return {\n", + " \"comments\": [],\n", + " \"error\": str(e),\n", + " }\n", + "\n", + "\n", + "model = CodeReviewModel()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Run Evaluation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from bugbug.tools.code_review.scorer import BasicMetricsScorer, LLMCommentMatchingScorer\n", + "\n", + "evaluation = weave.Evaluation(\n", + " dataset=dataset,\n", + " scorers=[BasicMetricsScorer(), LLMCommentMatchingScorer()],\n", + ")\n", + "\n", + "results = await evaluation.evaluate(model)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Visualizations" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "\n", + "# Extract key metrics for visualization\n", + "metrics = {\n", + " \"Recall (Valid)\": results.get(\"LLMCommentMatchingScorer\", {}).get(\n", + " \"recall_valid\", 0\n", + " ),\n", + " \"Recall (Invalid)\": results.get(\"LLMCommentMatchingScorer\", {}).get(\n", + " \"recall_invalid\", 0\n", + " ),\n", + " \"Missed Valid Rate\": results.get(\"LLMCommentMatchingScorer\", {}).get(\n", + " \"missed_valid_rate\", 0\n", + " ),\n", + " \"Error Rate\": results.get(\"BasicMetricsScorer\", {}).get(\"error_rate\", 0),\n", + "}\n", + "\n", + "# Bar chart\n", + "fig, ax = plt.subplots(figsize=(10, 6))\n", + "bars = ax.bar(\n", + " metrics.keys(), metrics.values(), color=[\"green\", \"orange\", \"red\", \"gray\"]\n", + ")\n", + "ax.set_ylabel(\"Rate\")\n", + "ax.set_title(\"Code Review Evaluation Metrics\")\n", + "ax.set_ylim(0, 1)\n", + "\n", + "# Add value labels on bars\n", + "for bar, value in zip(bars, metrics.values()):\n", + " ax.text(\n", + " bar.get_x() + bar.get_width() / 2,\n", + " bar.get_height() + 0.02,\n", + " f\"{value:.2%}\",\n", + " ha=\"center\",\n", + " va=\"bottom\",\n", + " )\n", + "\n", + 
"plt.xticks(rotation=45, ha=\"right\")\n", + "plt.tight_layout()\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Comment counts comparison\n", + "basic_metrics = results.get(\"BasicMetricsScorer\", {})\n", + "\n", + "counts = {\n", + " \"Generated\": basic_metrics.get(\"total_generated_comments\", 0),\n", + " \"Ground Truth (Valid)\": basic_metrics.get(\"total_ground_truth_valid\", 0),\n", + " \"Ground Truth (Invalid)\": basic_metrics.get(\"total_ground_truth_invalid\", 0),\n", + "}\n", + "\n", + "fig, ax = plt.subplots(figsize=(8, 5))\n", + "bars = ax.bar(counts.keys(), counts.values(), color=[\"blue\", \"green\", \"red\"])\n", + "ax.set_ylabel(\"Count\")\n", + "ax.set_title(\"Comment Counts\")\n", + "\n", + "for bar, value in zip(bars, counts.values()):\n", + " ax.text(\n", + " bar.get_x() + bar.get_width() / 2,\n", + " bar.get_height() + 0.5,\n", + " str(int(value)),\n", + " ha=\"center\",\n", + " va=\"bottom\",\n", + " )\n", + "\n", + "plt.tight_layout()\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 7. View in W&B\n", + "\n", + "Visit [W&B Weave](https://wandb.ai) to see detailed traces, compare evaluations, and explore individual predictions." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "bugbug (3.12.7)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.7" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/requirements.txt b/requirements.txt index c3d5603aac..9e6870eb74 100644 --- a/requirements.txt +++ b/requirements.txt @@ -41,5 +41,6 @@ taskcluster==94.1.1 tenacity==9.1.2 tqdm==4.67.1 unidiff==0.7.5 +weave>=0.50.0 xgboost==3.1.2 zstandard==0.25.0 diff --git a/scripts/code_review_tool_evaluator.py b/scripts/code_review_tool_evaluator.py deleted file mode 100644 index 085035d834..0000000000 --- a/scripts/code_review_tool_evaluator.py +++ /dev/null @@ -1,613 +0,0 @@ -# -*- coding: utf-8 -*- -# This Source Code Form is subject to the terms of the Mozilla Public -# License, v. 2.0. If a copy of the MPL was not distributed with this file, -# You can obtain one at http://mozilla.org/MPL/2.0/. - -"""This script evaluates different variants of the code review tool. - -Before running this script, you may need to set the following environment -variables: - - BUGBUG_PHABRICATOR_URL - - BUGBUG_PHABRICATOR_TOKEN - - BUGBUG_*_API_KEY (replace * with your LLM provider) - - BUGBUG_QDRANT_API_KEY - - BUGBUG_QDRANT_LOCATION - -To specify different variants to evaluate, please modify the get_tool_variants -function. 
-""" - -import os -from collections import defaultdict -from datetime import datetime, timedelta - -import numpy as np -import pandas as pd -from langchain_classic.prompts import PromptTemplate -from langchain_core.runnables import RunnableSequence -from tabulate import tabulate - -from bugbug import db, phabricator, utils -from bugbug.code_search.mozilla import FunctionSearchMozilla -from bugbug.tools import code_review -from bugbug.tools.code_review.utils import parse_model_output -from bugbug.tools.core import llms -from bugbug.tools.core.exceptions import ModelResultError -from bugbug.vectordb import QdrantVectorDB - -VERBOSE_CODE_REVIEW = False - - -EVALUATION_TEMPLATE = """Your are an expert in code review at Mozilla Firefox. - -**Task**: - -Match two sets of code review comments to identify redundant comments. - -**Instructions**: - - 1. **Consider the following about all comments**: - - The comments are related to the same code patch. - - The comments may be written in different styles. - - 2. **Understand what each comment is addressing**: - - Read the comments in both sets. - - Understand the issue that each comment is addressing. - - 3. **Check for matches**: - - If you find a comment in the old set that is addressing the same issue as a comment in the new set, link them as redundant. - - The comments may not be identical, but they should be addressing the same issue. - - The level of detail in the comments may vary. - - 4. **Output format**: - - Output a list of matched comments. - - Include the comment IDs only in the output. - - Each element in the list should be an object with two keys: `old_comment_id` and `new_comment_id`. - - No explanation is needed in the output, only the IDs of the matched comments. - - The output should be a valid json only. 
- - -**Output example**: - - [ - {{"old_comment_id": 1, "new_comment_id": 3}}, - {{"old_comment_id": 4, "new_comment_id": 2}}, - ] - -**First set of comments (old comments)**: - -{old_comments} - -**Second set of comments (new comments)**: - -{new_comments} -""" - - -class FeedbackEvaluator: - def __init__(self, evaluation_dataset: str): - self.evaluated_comments = pd.read_csv(evaluation_dataset) - - llm = llms.create_openai_llm() - evaluate_comments_prompt = PromptTemplate.from_template(EVALUATION_TEMPLATE) - self.evaluation_chain = RunnableSequence(evaluate_comments_prompt, llm) - - def evaluate_diff_comments( - self, - diff_id: int, - new_comments: list[code_review.InlineComment], - ) -> list[dict]: - diff_evaluated_comments = self.evaluated_comments[ - self.evaluated_comments["diff_id"] == diff_id - ].reset_index() - diff_evaluated_comments["evaluation"] = np.where( - diff_evaluated_comments["evaluation"].isin(["CORRECT", "VALID_REDUNDANT"]), - "VALID", - "INVALID", - ) - - output = self.evaluation_chain.invoke( - { - "old_comments": [ - { - "id": i, - "content": raw["comment"], - "file": raw["file_path"], - } - for i, raw in diff_evaluated_comments.iterrows() - ], - "new_comments": [ - { - "id": i, - "content": comment.content, - "file": comment.filename, - } - for i, comment in enumerate(new_comments) - ], - } - ) - - matches = parse_model_output(output.content) - - results = [ - { - "new_comment": comment.content, - "new_comment_order": comment.order, - "old_comments_count": 0, - "matched": False, - } - for comment in new_comments - ] - seen_old_comments = set() - - for match in matches: - old_index = match["old_comment_id"] - new_index = match["new_comment_id"] - - evaluated_comment = diff_evaluated_comments.iloc[old_index] - new_comment = new_comments[new_index] - - if evaluated_comment["file_path"] != new_comment.filename: - print( - "File mismatch:", - evaluated_comment["file_path"], - new_comment.filename, - ) - continue - - current_result = results[new_index] - - current_result["evaluation"] = ( - "MIXED" - if ( - "evaluation" in current_result - and current_result["evaluation"] != evaluated_comment["evaluation"] - ) - else evaluated_comment["evaluation"] - ) - - if "old_comment" in current_result: - current_result["old_comment"] += ( - f"\n\n-------------\n\n{evaluated_comment['comment']}" - ) - else: - current_result["old_comment"] = evaluated_comment["comment"] - - current_result["old_comments_count"] += 1 - current_result["matched"] = True - - seen_old_comments.add(old_index) - - for i, raw in diff_evaluated_comments.iterrows(): - if i in seen_old_comments: - continue - - results.append( - { - "old_comment": raw["comment"], - "evaluation": raw["evaluation"], - "old_comments_count": 1, - } - ) - - self.print_evaluation_matches(results) - - return results - - @staticmethod - def print_evaluation_matches(matching_results: list[dict]): - print( - tabulate( - [ - ( - result.get("new_comment", ""), - result.get("old_comment", ""), - result.get("evaluation", ""), - ) - for result in matching_results - ], - tablefmt="mixed_grid", - headers=[ - "New Comment", - "Old Comment", - "Evaluation", - ], - maxcolwidths=[ - 60, - 60, - 20, - ], - ) - ) - - -def get_tool_variants( - variants: list[str], -) -> list[tuple[str, code_review.CodeReviewTool]]: - """Returns a list of tool variants to evaluate. - - Returns: - List of tuples, where each tuple contains the name of the variant and - and instance of the code review tool to evaluate. 
- """ - # Step 1: we start with instantiating the dependencies based on the selected - # variants. - repo_dir = "../mozilla-unified" - - def get_file(commit_hash, path): - r = utils.get_session("hgmo").get( - f"https://hg.mozilla.org/mozilla-unified/raw-file/{commit_hash}/{path}", - headers={ - "User-Agent": utils.get_user_agent(), - }, - ) - r.raise_for_status() - return r.text - - function_search = FunctionSearchMozilla(repo_dir, get_file, True) - - review_comments_db = code_review.ReviewCommentsDB(QdrantVectorDB("diff_comments")) - - suggestions_feedback_db = code_review.SuggestionsFeedbackDB( - QdrantVectorDB("suggestions_feedback") - ) - - # Step 2: we create the selected tool variants. - - tool_variants = [] - - if "claude" in variants: - tool_variants.append( - ( - "Claude", - code_review.CodeReviewTool.create( - function_search=function_search, - review_comments_db=review_comments_db, - suggestions_feedback_db=suggestions_feedback_db, - verbose=VERBOSE_CODE_REVIEW, - ), - ) - ) - - if "gpt" in variants: - llm = llms.create_openai_llm() - tool_variants.append( - ( - "GPT", - code_review.CodeReviewTool.create( - llm=llm, - summarization_llm=llm, - filtering_llm=llm, - function_search=function_search, - review_comments_db=review_comments_db, - suggestions_feedback_db=suggestions_feedback_db, - verbose=VERBOSE_CODE_REVIEW, - ), - ) - ) - - return tool_variants - - -def get_review_requests_sample(since: timedelta, limit: int): - assert db.download(phabricator.REVISIONS_DB) - - start_date = (datetime.now() - since).timestamp() - MOZILLA_CENTRAL_PHID = "PHID-REPO-saax4qdxlbbhahhp2kg5" - - n = 0 - for review_request in phabricator.get_revisions(): - if ( - review_request["fields"]["repositoryPHID"] != MOZILLA_CENTRAL_PHID - or review_request["fields"]["dateCreated"] <= start_date - ): - continue - - if n >= limit >= 0: - break - - yield review_request["id"] - n += 1 - - -def print_prettified_comments(comments: list[code_review.InlineComment]): - if not comments: - print("No comments to show.") - return - - print( - tabulate( - [ - ( - comment.filename, - comment.end_line, - comment.content, - ) - for comment in comments - ], - headers=[ - "File", - "Line", - "Comment", - ], - maxcolwidths=[ - 30, - 10, - 100, - ], - ), - ) - - -def get_latest_evaluation_results_file(results_dir: str | None): - import glob - import os - - files = glob.glob("evaluation_results_*#*.csv", root_dir=results_dir) - if not files: - raise FileNotFoundError("No evaluation results file found.") - - latests_files = max(files) - if results_dir: - return os.path.join(results_dir, latests_files) - - return latests_files - - -def get_ongoing_evaluation_results_file(results_dir: str | None): - import glob - import os - - base_file = get_latest_evaluation_results_file(results_dir) - files = [ - file - for file in glob.glob("evaluation_results_*.csv", root_dir=results_dir) - if "#" not in file and file > base_file - ] - if not files: - raise FileNotFoundError("No ongoing evaluation results file found.") - - latests_file = max(files) - if results_dir: - return os.path.join(results_dir, latests_file) - - return latests_file - - -def main(args): - review_platform = "phabricator" - review_data: code_review.ReviewData = code_review.review_data_classes[ - review_platform - ]() - - tool_variants = get_tool_variants(args.variants) - - evaluator = FeedbackEvaluator(args.evaluation_dataset) - - result_file = os.path.join( - args.results_dir, - "code_review_tool_evaluator.csv", - ) - is_first_result = not os.path.exists(result_file) - 
- if is_first_result: - evaluation_results_file = os.path.join( - args.results_dir, - f"evaluation_results_{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.csv", - ) - seen_patches = set() - else: - evaluation_results_file = get_ongoing_evaluation_results_file(args.results_dir) - seen_patches = set(pd.read_csv(evaluation_results_file)["diff_id"].to_list()) - - result_unique_columns = ["Review Request ID", "File", "Line", "Comment Number"] - result_all_columns = result_unique_columns + [ - f"{title} ({variant_name})" - for variant_name, _ in tool_variants - for title in ("Comment", "Evaluation") - ] - evaluation_result_all_columns = [ - "variant_name", - "revision_id", - "diff_id", - "new_comment", - "new_comment_order", - "old_comments_count", - "matched", - "old_comment", - "evaluation", - ] - - selected_review_requests = [] - if args.diff_ids: - selected_review_requests = ( - ("n/a", code_review.ReviewRequest(diff_id)) for diff_id in args.diff_ids - ) - elif args.review_request_ids: - selected_review_requests = ( - (review_request_id, review_data.get_review_request_by_id(review_request_id)) - for review_request_id in args.review_request_ids - ) - elif args.evaluation_strategy == "random": - print("No review request IDs specified. Selecting a random sample.") - selected_review_requests = ( - (revision_id, code_review.ReviewRequest(diff_id)) - for revision_id, diff_id in evaluator.evaluated_comments.query( - "evaluation == 'CORRECT'" - )[["revision_id", "diff_id"]] - .drop_duplicates() - .sample(20) - .itertuples(index=False) - ) - elif args.evaluation_strategy == "same": - selected_review_requests = ( - (revision_id, code_review.ReviewRequest(diff_id)) - for revision_id, diff_id in pd.read_csv( - get_latest_evaluation_results_file(args.results_dir), - )[["revision_id", "diff_id"]] - .drop_duplicates() - .itertuples(name=None, index=False) - ) - else: - raise ValueError( - "Please specify either --diff-id or --revision-id. Alternatively, use --evaluation-strategy." - ) - - for review_request_id, review_request in selected_review_requests: - if review_request_id in [227266, 233414]: - print( - f"Skipping Review Request ID {review_request_id} because it is known to cause issues." - ) - continue - - if review_request.patch_id in seen_patches: - print( - f"Skipping Review Request ID {review_request_id} (Diff ID {review_request.patch_id}) because it was already evaluated." 
- ) - continue - - print("---------------------------------------------------------") - print(f"Review Request ID: {review_request_id}") - print(f"Patch ID: {review_request.patch_id}") - patch = review_data.get_patch_by_id(review_request.patch_id) - print("---------------------------------------------------------") - - if len(patch.raw_diff) > 20_000: - print("Skipping the patch because it is too large.") - continue - - all_variants_results = [] - all_variants_evaluation_results = [] - for variant_name, tool in tool_variants: - print(f"\n\nVariant: {variant_name}\n") - try: - comments = tool.run(patch) - except code_review.FileNotInPatchError as e: - print("Error while running the tool:", e) - continue - except code_review.LargeDiffError: - print("Skipping the patch because it is too large.") - continue - except ModelResultError as e: - print("Error while running the tool:", e) - continue - - print_prettified_comments(comments) - comment_per_line_counter = defaultdict(int) - - evaluation = evaluator.evaluate_diff_comments( - review_request.patch_id, comments - ) - - all_variants_evaluation_results.extend( - { - "variant_name": variant_name, - "revision_id": review_request_id, - "diff_id": review_request.patch_id, - **row, - } - for row in evaluation - ) - - for i, comment in enumerate(comments): - key = (review_request_id, comment.filename, comment.end_line) - comment_per_line_counter[key] += 1 - - all_variants_results.append( - { - "Review Request ID": review_request_id, - "File": comment.filename, - "Line": comment.end_line, - "Comment Number": comment_per_line_counter[key], - f"Comment ({variant_name})": comment.content, - f"Evaluation ({variant_name})": evaluation[i].get("evaluation"), - } - ) - - df = ( - pd.DataFrame(all_variants_results, columns=result_all_columns) - .groupby(result_unique_columns) - .first() - ) - df.to_csv( - result_file, - header=is_first_result, - mode="w" if is_first_result else "a", - ) - - df = pd.DataFrame( - all_variants_evaluation_results, columns=evaluation_result_all_columns - ) - df.to_csv( - evaluation_results_file, - index=False, - header=is_first_result, - mode="w" if is_first_result else "a", - ) - - if is_first_result: - is_first_result = False - print("You can find the results in the file:", result_file) - - print("\n\n\n") - - -if __name__ == "__main__": - import argparse - - parser = argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter - ) - parser.add_argument( - "-r", - "--revision-id", - dest="review_request_ids", - action="append", - help="if specified, run only the selected Revision ID(s)", - metavar="REVISION_ID", - type=int, - ) - parser.add_argument( - "-d", - "--diff-id", - dest="diff_ids", - action="append", - help="if specified, run only the selected Diff ID(s)", - metavar="DIFF_ID", - type=int, - ) - parser.add_argument( - "--evaluation-data", - dest="evaluation_dataset", - action="store", - help="the path or the URL to a evaluation dataset in CSV format", - ) - - parser.add_argument( - "--results-dir", - dest="results_dir", - action="store", - help="the directory to store the results and read previous results", - ) - - parser.add_argument( - "--evaluation-strategy", - dest="evaluation_strategy", - action="store", - help="the evaluation strategy to use", - ) - parser.add_argument( - "--variant", - dest="variants", - action="append", - help="the variants to use, use multiple times for multiple variants", - choices=["claude", "gpt"], - required=True, - ) - - args = parser.parse_args() - - if args.diff_ids 
and args.review_request_ids: - parser.error("Please specify either --diff-id or --revision-id, not both.") - - main(args) diff --git a/scripts/code_review_tool_evaluator_report.py b/scripts/code_review_tool_evaluator_report.py deleted file mode 100644 index 801be5bcb4..0000000000 --- a/scripts/code_review_tool_evaluator_report.py +++ /dev/null @@ -1,109 +0,0 @@ -# %% - -import pandas as pd - -import scripts.code_review_tool_evaluator as evaluator_script - -evaluation_results = pd.read_csv( - # evaluator_script.get_latest_evaluation_results_file("../evaluation_results") - evaluator_script.get_ongoing_evaluation_results_file("../evaluation_results") -) - -# %% - -variant_names = evaluation_results["variant_name"].unique() -variant_name = variant_names[0] - -df = evaluation_results[evaluation_results["variant_name"] == variant_name] - - -# %% -new_comments_count = df["new_comment"].count() -new_valid_comments = len(df[~df["new_comment"].isna() & (df["evaluation"] == "VALID")]) -new_invalid_comments = len( - df[~df["new_comment"].isna() & (df["evaluation"] == "INVALID")] -) -new_unevaluated_comments = len(df[~df["new_comment"].isna() & df["evaluation"].isna()]) - -old_comments_count = df["old_comments_count"].sum() -old_valid_comments = df[df["evaluation"] == "VALID"]["old_comments_count"].sum() -old_invalid_comments = df[df["evaluation"] == "INVALID"]["old_comments_count"].sum() - -matched_valid_comments = df[ - ~df["new_comment"].isna() - & ~df["old_comment"].isna() - & (df["evaluation"] == "VALID") -]["old_comments_count"].sum() -matched_invalid_comments = df[ - ~df["new_comment"].isna() - & ~df["old_comment"].isna() - & (df["evaluation"] == "INVALID") -]["old_comments_count"].sum() - -print("--------------------") -print("Variant Name:", variant_name) -print("--------------------") -print("New Comments:", new_comments_count) -print("New Valid Comments:", new_valid_comments) -print("New Invalid Comments:", new_invalid_comments) -print("New Unevaluated Comments:", new_unevaluated_comments) -print("--------------------") -print("Old Comments:", old_comments_count) -print("Old Valid Comments:", old_valid_comments) -print("Old Invalid Comments:", old_invalid_comments) -print("--------------------") -print( - "Recalled comments:", - (matched_valid_comments + matched_invalid_comments) / old_comments_count * 100, -) -print("Recalled valid comments:", matched_valid_comments / old_valid_comments * 100) -print( - "Recalled invalid comments:", matched_invalid_comments / old_invalid_comments * 100 -) -print("--------------------") -print( - "Missed valid comments:", - (old_valid_comments - matched_valid_comments) / old_valid_comments * 100, -) -print( - "Missed invalid comments:", - (old_invalid_comments - matched_invalid_comments) / old_invalid_comments * 100, -) - - -# %% - - -df = evaluation_results[ - evaluation_results["evaluation"].isin(["VALID", "INVALID"]) - & ~evaluation_results["new_comment"].isna() -].sort_values(by=["evaluation", "revision_id", "new_comment"]) - - -df["id"] = df["diff_id"].astype(str) + " | " + df["new_comment"].astype(str) -df_old = df[df["variant_name"] == variant_names[0]] -df_new = df[df["variant_name"] == variant_names[1]] - -in_new_but_not_in_old = df_new[~df_new["id"].isin(df_old["id"])] - -print( - "Examples of comments that were filtered by the old version but were not filtered by the new version:\n" -) -print( - in_new_but_not_in_old[["revision_id", "new_comment", "evaluation"]].to_markdown( - index=False - ) -) - - -in_old_but_not_in_new = 
df_old[~df_old["id"].isin(df_new["id"])] -print( - "\n\nExamples of comments that were filtered by the new version but were not filtered by the old version:\n" -) -print( - in_old_but_not_in_new[["revision_id", "new_comment", "evaluation"]].to_markdown( - index=False - ) -) - -# %%