diff --git a/bugbug/code_search/mozilla.py b/bugbug/code_search/mozilla.py
index b80c25db0a..1c83af4578 100644
--- a/bugbug/code_search/mozilla.py
+++ b/bugbug/code_search/mozilla.py
@@ -10,10 +10,10 @@
class FunctionSearchMozilla(FunctionSearch):
- def __init__(self, repo_dir, get_file, fast=False):
+ def __init__(self, repo_dir, get_file=None, fast=False):
super().__init__()
self.repo_dir = repo_dir
- self.get_file = get_file
+ from bugbug.code_search.searchfox_api import FunctionSearchSearchfoxAPI
+
+ self.get_file = get_file or FunctionSearchSearchfoxAPI._get_file
self.fast = fast
def get_function(
diff --git a/bugbug/code_search/searchfox_api.py b/bugbug/code_search/searchfox_api.py
index 9aa7b76d5a..56d6579943 100644
--- a/bugbug/code_search/searchfox_api.py
+++ b/bugbug/code_search/searchfox_api.py
@@ -214,9 +214,18 @@ def search(commit_hash, symbol_name):
class FunctionSearchSearchfoxAPI(FunctionSearch):
- def __init__(self, get_file):
+ def __init__(self, get_file=None):
super().__init__()
- self.get_file = get_file
+ self.get_file = get_file or self._get_file
+
+ @staticmethod
+ def _get_file(commit_hash, path):
+ r = utils.get_session("hgmo").get(
+ f"https://hg.mozilla.org/mozilla-unified/raw-file/{commit_hash}/{path}",
+ headers={"User-Agent": utils.get_user_agent()},
+ )
+ r.raise_for_status()
+ return r.text
def definitions_to_results(self, commit_hash, definitions):
result = []
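With this change, `FunctionSearchSearchfoxAPI` (and, through it, `FunctionSearchMozilla`) works without an explicit file fetcher, defaulting to raw-file lookups on hg.mozilla.org. A minimal sketch of both construction paths; the local-checkout fetcher and its path are hypothetical:

```python
# Sketch only: assumes network access to hg.mozilla.org for the default fetcher.
from bugbug.code_search.searchfox_api import FunctionSearchSearchfoxAPI

# Default: file contents come from hg.mozilla.org/mozilla-unified via _get_file.
search = FunctionSearchSearchfoxAPI()


# A custom fetcher can still be injected, e.g. one backed by a local checkout.
def get_file_from_checkout(commit_hash: str, path: str) -> str:
    # Toy fetcher: ignores commit_hash and reads the working copy (hypothetical path).
    with open(f"/path/to/mozilla-unified/{path}") as f:
        return f.read()


search_local = FunctionSearchSearchfoxAPI(get_file=get_file_from_checkout)
```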
diff --git a/bugbug/tools/base.py b/bugbug/tools/base.py
index c36b1a19c5..2680bd5f85 100644
--- a/bugbug/tools/base.py
+++ b/bugbug/tools/base.py
@@ -8,13 +8,5 @@
class GenerativeModelTool(ABC):
- @property
- @abstractmethod
- def version(self) -> str: ...
-
@abstractmethod
def run(self, *args, **kwargs) -> Any: ...
-
- @staticmethod
- def _print_answer(answer):
- print(f"\u001b[33;1m\033[1;3m{answer}\u001b[0m")
diff --git a/bugbug/tools/code_review/agent.py b/bugbug/tools/code_review/agent.py
index 203d95e2ba..32bd38c971 100644
--- a/bugbug/tools/code_review/agent.py
+++ b/bugbug/tools/code_review/agent.py
@@ -9,44 +9,39 @@
import os
from datetime import datetime
from logging import getLogger
-from typing import Iterable, Optional
+from typing import Optional, Protocol
from langchain.agents import create_agent
from langchain.agents.structured_output import ProviderStrategy
from langchain.chat_models import BaseChatModel
from langchain.messages import HumanMessage
-from langchain_classic.chains import LLMChain
-from langchain_classic.prompts import PromptTemplate
from langgraph.errors import GraphRecursionError
from pydantic import BaseModel, Field
from unidiff import PatchSet
from bugbug.code_search.function_search import FunctionSearch
from bugbug.tools.base import GenerativeModelTool
-from bugbug.tools.code_review.database import ReviewCommentsDB, SuggestionsFeedbackDB
+from bugbug.tools.code_review.database import ReviewCommentsDB
from bugbug.tools.code_review.langchain_tools import (
CodeReviewContext,
create_find_function_definition_tool,
expand_context,
)
from bugbug.tools.code_review.prompts import (
- DEFAULT_REJECTED_EXAMPLES,
FIRST_MESSAGE_TEMPLATE,
- PROMPT_TEMPLATE_FILTERING_ANALYSIS,
- PROMPT_TEMPLATE_SUMMARIZATION,
STATIC_COMMENT_EXAMPLES,
SYSTEM_PROMPT_TEMPLATE,
TEMPLATE_COMMENT_EXAMPLE,
TEMPLATE_PATCH_FROM_HUNK,
)
from bugbug.tools.code_review.utils import (
+ convert_generated_comments_to_inline,
format_patch_set,
- generate_processed_output,
)
from bugbug.tools.core.data_types import InlineComment
from bugbug.tools.core.exceptions import LargeDiffError, ModelResultError
from bugbug.tools.core.llms import get_tokenizer
-from bugbug.tools.core.platforms.base import Patch
+from bugbug.tools.core.platforms.base import Patch, ReviewData
logger = getLogger(__name__)
@@ -73,25 +68,35 @@ class AgentResponse(BaseModel):
)
-class CodeReviewTool(GenerativeModelTool):
- version = "0.0.1"
+class PatchSummarizer(Protocol):
+ def run(self, patch: Patch) -> str: ...
+
+
+class SuggestionFilterer(Protocol):
+ def run(
+ self, suggestions: list[GeneratedReviewComment]
+ ) -> list[GeneratedReviewComment]: ...
+
+class CodeReviewTool(GenerativeModelTool):
def __init__(
self,
llm: BaseChatModel,
- summarization_llm: BaseChatModel,
- filtering_llm: BaseChatModel,
+ patch_summarizer: PatchSummarizer,
+ suggestion_filterer: SuggestionFilterer,
+ review_data: ReviewData,
function_search: Optional[FunctionSearch] = None,
review_comments_db: Optional["ReviewCommentsDB"] = None,
show_patch_example: bool = False,
verbose: bool = True,
- suggestions_feedback_db: Optional["SuggestionsFeedbackDB"] = None,
target_software: str = "Mozilla Firefox",
) -> None:
super().__init__()
self.target_software = target_software
+ self.review_data = review_data
+
self._tokenizer = get_tokenizer(
llm.model_name if hasattr(llm, "model_name") else ""
)
@@ -108,24 +113,8 @@ def __init__(
"----------------------------------------------------"
)
- self.summarization_chain = LLMChain(
- prompt=PromptTemplate.from_template(
- PROMPT_TEMPLATE_SUMMARIZATION,
- partial_variables={
- "experience_scope": f"the {self.target_software} source code"
- },
- ),
- llm=summarization_llm,
- verbose=verbose,
- )
- self.filtering_chain = LLMChain(
- prompt=PromptTemplate.from_template(
- PROMPT_TEMPLATE_FILTERING_ANALYSIS,
- partial_variables={"target_code_consistency": self.target_software},
- ),
- llm=filtering_llm,
- verbose=verbose,
- )
+ self.patch_summarizer = patch_summarizer
+ self.suggestion_filterer = suggestion_filterer
tools = [expand_context]
if function_search:
@@ -146,60 +135,77 @@ def __init__(
self.verbose = verbose
- self.suggestions_feedback_db = suggestions_feedback_db
-
@staticmethod
- def create(
- llm=None, summarization_llm=None, filtering_llm=None, **kwargs
- ) -> "CodeReviewTool":
- from bugbug.tools.core.llms import create_anthropic_llm
-
- return CodeReviewTool(
- llm=llm
- or create_anthropic_llm(
+ def create(**kwargs):
+ """Factory method to instantiate the tool with default dependencies.
+
+ This method takes the same parameters as the constructor, but all
+ parameters are optional. If a parameter is not provided, a default
+ component will be created and used.
+ """
+ if "function_search" not in kwargs:
+ from bugbug.code_search.searchfox_api import FunctionSearchSearchfoxAPI
+
+ kwargs["function_search"] = FunctionSearchSearchfoxAPI()
+
+ if "review_comments_db" not in kwargs:
+ from bugbug.tools.code_review.database import ReviewCommentsDB
+ from bugbug.vectordb import QdrantVectorDB
+
+ kwargs["review_comments_db"] = ReviewCommentsDB(
+ QdrantVectorDB("diff_comments")
+ )
+
+ if "review_data" not in kwargs:
+ from bugbug.tools.core.platforms.phabricator import PhabricatorReviewData
+
+ kwargs["review_data"] = PhabricatorReviewData()
+
+ if "llm" not in kwargs:
+ from bugbug.tools.core.llms import create_anthropic_llm
+
+ kwargs["llm"] = create_anthropic_llm(
model_name="claude-opus-4-5-20251101",
max_tokens=40_000,
temperature=None,
thinking={"type": "enabled", "budget_tokens": 10_000},
- ),
- summarization_llm=summarization_llm or create_anthropic_llm(),
- filtering_llm=filtering_llm or create_anthropic_llm(),
- **kwargs,
- )
+ )
- def count_tokens(self, text):
- return len(self._tokenizer.encode(text))
+ if "patch_summarizer" not in kwargs:
+ from bugbug.tools.patch_summarization.agent import PatchSummarizationTool
- def generate_initial_prompt(self, patch: Patch) -> str:
- formatted_patch = format_patch_set(patch.patch_set)
+ kwargs["patch_summarizer"] = PatchSummarizationTool.create()
- output_summarization = self.summarization_chain.invoke(
- {
- "patch": formatted_patch,
- "bug_title": patch.bug_title,
- "patch_title": patch.patch_title,
- "patch_description": patch.patch_description,
- },
- return_only_outputs=True,
- )["text"]
+ if "suggestion_filterer" not in kwargs:
+ from bugbug.tools.suggestion_filtering.agent import SuggestionFilteringTool
- if self.verbose:
- GenerativeModelTool._print_answer(output_summarization)
+ kwargs["suggestion_filterer"] = SuggestionFilteringTool.create()
+ return CodeReviewTool(**kwargs)
+
+ def count_tokens(self, text):
+ return len(self._tokenizer.encode(text))
+
+ def generate_initial_prompt(self, patch: Patch, patch_summary: str) -> str:
created_before = patch.date_created if self.is_experiment_env else None
+
return FIRST_MESSAGE_TEMPLATE.format(
- patch=formatted_patch,
- patch_summarization=output_summarization,
+ patch=format_patch_set(patch.patch_set),
+ patch_summarization=patch_summary,
comment_examples=self._get_comment_examples(patch, created_before),
approved_examples=self._get_generated_examples(patch, created_before),
)
- def _generate_suggestions(self, patch: Patch) -> list[GeneratedReviewComment]:
+ def generate_review_comments(
+ self, patch: Patch, patch_summary: str
+ ) -> list[GeneratedReviewComment]:
try:
for chunk in self.agent.stream(
{
"messages": [
- HumanMessage(self.generate_initial_prompt(patch)),
+ HumanMessage(
+ self.generate_initial_prompt(patch, patch_summary)
+ ),
]
},
context=CodeReviewContext(patch=patch),
@@ -212,35 +218,26 @@ def _generate_suggestions(self, patch: Patch) -> list[GeneratedReviewComment]:
return result["structured_response"].comments
+ def run_by_diff_id(self, diff_id: str | int) -> list[InlineComment] | None:
+ patch = self.review_data.get_patch_by_id(diff_id)
+ return self.run(patch)
+
def run(self, patch: Patch) -> list[InlineComment] | None:
if self.count_tokens(patch.raw_diff) > 21000:
raise LargeDiffError("The diff is too large")
- unfiltered_suggestions = self._generate_suggestions(patch)
+ patch_summary = self.patch_summarizer.run(patch)
+
+ unfiltered_suggestions = self.generate_review_comments(patch, patch_summary)
if not unfiltered_suggestions:
logger.info("No suggestions were generated")
return []
- rejected_examples = (
- "\n - ".join(self.get_similar_rejected_comments(unfiltered_suggestions))
- if self.suggestions_feedback_db
- else DEFAULT_REJECTED_EXAMPLES
- )
-
- raw_output = self.filtering_chain.invoke(
- {
- "comments": str(
- [comment.model_dump() for comment in unfiltered_suggestions]
- ),
- "rejected_examples": rejected_examples,
- },
- return_only_outputs=True,
- )["text"]
-
- if self.verbose:
- GenerativeModelTool._print_answer(raw_output)
+ filtered_suggestions = self.suggestion_filterer.run(unfiltered_suggestions)
- return list(generate_processed_output(raw_output, patch.patch_set))
+ return list(
+ convert_generated_comments_to_inline(filtered_suggestions, patch.patch_set)
+ )
def _get_generated_examples(self, patch, created_before: datetime | None = None):
"""Get examples of comments that were generated by an LLM.
@@ -320,24 +317,3 @@ def generate_formatted_patch_from_raw_hunk(raw_hunk, filename):
)
for num, example in enumerate(comment_examples)
)
-
- def get_similar_rejected_comments(
- self, suggestions: list[GeneratedReviewComment]
- ) -> Iterable[str]:
- if not self.suggestions_feedback_db:
- raise Exception("Suggestions feedback database is not available")
-
- num_examples_per_suggestion = 10 // len(suggestions) or 1
- seen_ids: set[int] = set()
-
- for suggestion in suggestions:
- similar_rejected_suggestions = (
- self.suggestions_feedback_db.find_similar_rejected_suggestions(
- suggestion.comment,
- limit=num_examples_per_suggestion,
- excluded_ids=seen_ids,
- )
- )
- for rejected_suggestion in similar_rejected_suggestions:
- seen_ids.add(rejected_suggestion.id)
- yield rejected_suggestion.comment
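The refactored tool now takes its summarizer and filterer as injected dependencies that only need to satisfy the `PatchSummarizer` and `SuggestionFilterer` protocols, while `create()` wires up defaults. A hedged sketch of typical usage, assuming the defaults built by `create()` (Phabricator review data, Searchfox function search, Qdrant comment DB, Anthropic LLMs) and their credentials; the diff ID is hypothetical:

```python
# Sketch only: relies on the factory defaults introduced in this patch and on
# Phabricator/Anthropic/Qdrant credentials being configured in the environment.
from bugbug.tools.code_review.agent import CodeReviewTool

tool = CodeReviewTool.create()

# One call: fetch the patch, summarize it, generate comments, filter them.
comments = tool.run_by_diff_id("123456")  # hypothetical diff ID

# Or drive the steps individually when the patch is already at hand.
patch = tool.review_data.get_patch_by_id("123456")  # hypothetical diff ID
summary = tool.patch_summarizer.run(patch)
suggestions = tool.generate_review_comments(patch, summary)
final = tool.suggestion_filterer.run(suggestions)
```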
diff --git a/bugbug/tools/code_review/prompts.py b/bugbug/tools/code_review/prompts.py
index 9b253c026b..7bffd5c442 100644
--- a/bugbug/tools/code_review/prompts.py
+++ b/bugbug/tools/code_review/prompts.py
@@ -7,31 +7,6 @@
TARGET_SOFTWARE: str | None = None
-PROMPT_TEMPLATE_SUMMARIZATION = """You are an expert reviewer for {experience_scope}, with experience on source code reviews.
-
-Please, analyze the code provided and report a summarization about the new changes; for that, focus on the coded added represented by lines that start with "+".
-
-The summarization should have two parts:
- 1. **Intent**: Describe the intent of the changes, what they are trying to achieve, and how they relate to the bug or feature request.
- 2. **Solution**: Describe the solution implemented in the code changes, focusing on how the changes address the intent.
-
-Do not include any code in the summarization, only a description of the changes.
-
-**Bug title**:
-
-{bug_title}
-
-
-**Commit message**:
-
-{patch_title}
-{patch_description}
-
-
-**Diff**:
-
-{patch}
-"""
SYSTEM_PROMPT_TEMPLATE = """You are an expert {target_software} engineer tasked with analyzing a pull request and providing high-quality review comments. You will examine a code patch and generate constructive feedback focusing on potential issues in the changed code.
@@ -113,36 +88,6 @@
{comments}"""
-PROMPT_TEMPLATE_FILTERING_ANALYSIS = """Filter review comments to keep those that:
-- are consistent with the {target_code_consistency} source code;
-- focus on reporting possible bugs, functional regressions, issues, or similar concerns;
-- report readability or design concerns.
-
-Exclude comments that:
-- only describe the change;
-- restate obvious facts like renamed variables or replaced code;
-- include praising;
-- ask if changes are intentional or ask to ensure things exist.
-
-Only return a valid JSON list. Do not drop any key from the JSON objects.
-
-Comments:
-{comments}
-
-As examples of not expected comments, not related to the current patch, please, check some below:
- - {rejected_examples}
-"""
-
-
-DEFAULT_REJECTED_EXAMPLES = """Please note that these are minor improvements and the overall quality of the patch is good. The documentation is being expanded in a clear and structured way, which will likely be beneficial for future development.
- - Please note that these are just suggestions and the code might work perfectly fine as it is. It's always a good idea to test all changes thoroughly to ensure they work as expected.
- - Overall, the patch seems to be well implemented with no major concerns. The developers have made a conscious decision to align with Chrome's behavior, and the reasoning is well documented.
- - There are no complex code changes in this patch, so there's no potential for major readability regressions or bugs introduced by the changes.
- - The `focus(...)` method is called without checking if the element and its associated parameters exist or not. It would be better to check if the element exists before calling the `focus()` method to avoid potential errors.
- - It's not clear if the `SearchService.sys.mjs` file exists or not. If it doesn't exist, this could cause an error. Please ensure that the file path is correct.
- - This is a good addition to the code."""
-
-
STATIC_COMMENT_EXAMPLES = [
{
"comment": {
diff --git a/bugbug/tools/code_review/scorer.py b/bugbug/tools/code_review/scorer.py
new file mode 100644
index 0000000000..fac5aab475
--- /dev/null
+++ b/bugbug/tools/code_review/scorer.py
@@ -0,0 +1,285 @@
+from functools import cached_property
+from logging import getLogger
+
+import weave
+
+from bugbug.tools.comment_matching.agent import CommentMatchingTool
+from bugbug.tools.suggestion_filtering.agent import SuggestionFilteringTool
+
+logger = getLogger(__name__)
+
+
+class BasicMetricsScorer(weave.Scorer):
+ """Score basic metrics: comment counts and error tracking."""
+
+ @weave.op()
+ def score(
+ self,
+ output: dict,
+ ground_truth_comments: list[dict],
+ ) -> dict:
+ valid_comment_count = sum(
+ comment["evaluation"] == "VALID" for comment in ground_truth_comments
+ )
+ invalid_comment_count = sum(
+ comment["evaluation"] == "INVALID" for comment in ground_truth_comments
+ )
+
+ return {
+ "generated_comment_count": len(output["comments"]),
+ "ground_truth_valid_count": valid_comment_count,
+ "ground_truth_invalid_count": invalid_comment_count,
+ "ground_truth_total_count": len(ground_truth_comments),
+ "successful": not output["error"],
+ }
+
+ def summarize(self, score_rows: list[dict]) -> dict:
+ """Aggregate scores across all examples."""
+ total_generated = sum(r["generated_comment_count"] for r in score_rows)
+ total_gt_valid = sum(r["ground_truth_valid_count"] for r in score_rows)
+ total_gt_invalid = sum(r["ground_truth_invalid_count"] for r in score_rows)
+ total_gt = sum(r["ground_truth_total_count"] for r in score_rows)
+ error_count = sum(not r["successful"] for r in score_rows)
+
+ return {
+ "total_generated_comments": total_generated,
+ "total_ground_truth_valid": total_gt_valid,
+ "total_ground_truth_invalid": total_gt_invalid,
+ "total_ground_truth": total_gt,
+ "avg_generated_per_diff": (
+ total_generated / len(score_rows) if score_rows else 0
+ ),
+ "error_rate": error_count / len(score_rows) if score_rows else 0,
+ "num_examples": len(score_rows),
+ }
+
+
+class LLMCommentMatchingScorer(weave.Scorer):
+ """Score comment matching using LLM-based semantic comparison.
+
+ This scorer uses an LLM to match generated comments against ground truth
+ comments, calculating recall and precision metrics.
+ """
+
+ @cached_property
+ def matching_tool(self):
+ return CommentMatchingTool.create()
+
+ @cached_property
+ def filtering_tool(self):
+ return SuggestionFilteringTool.create()
+
+ @weave.op()
+ def score(
+ self,
+ output: dict,
+ ground_truth_comments: list[dict],
+ diff_id: int,
+ ) -> dict:
+ generated_comments = output["comments"]
+
+ retained_indices = set(
+ self.filtering_tool.get_indices_of_retained_comments(generated_comments)
+ )
+ retained_comments = [
+ c for i, c in enumerate(generated_comments) if i in retained_indices
+ ]
+ excluded_comments = [
+ c for i, c in enumerate(generated_comments) if i not in retained_indices
+ ]
+
+ old_comments = [
+ {"id": i, "content": c["comment"], "file": c["file_path"]}
+ for i, c in enumerate(ground_truth_comments)
+ ]
+
+ new_comments = [
+ {"id": i, "content": c.comment, "file": c.file}
+ for i, c in enumerate(generated_comments)
+ ]
+
+ matches = self.matching_tool.run(
+ old_comments=old_comments, new_comments=new_comments
+ )
+
+ seen_old: set[int] = set()
+ seen_new: set[int] = set()
+ matched_valid_retained = []
+ matched_valid_excluded = []
+ matched_invalid_retained = []
+ matched_invalid_excluded = []
+
+ for match in matches:
+ old_idx = match.old_comment_id
+ new_idx = match.new_comment_id
+
+ if old_idx >= len(ground_truth_comments) or new_idx >= len(
+ generated_comments
+ ):
+ continue
+
+ # Validate file match
+ gt_comment = ground_truth_comments[old_idx]
+ gen_comment = generated_comments[new_idx]
+
+ if gt_comment["file_path"] != gen_comment.file:
+ logger.debug(
+ f"File mismatch for diff {diff_id}: "
+ f"{gt_comment['file_path']} != {gen_comment.file}"
+ )
+ continue
+
+ seen_old.add(old_idx)
+ seen_new.add(new_idx)
+
+ is_retained = new_idx in retained_indices
+ match_comments = {
+ "ground_truth_comment": gt_comment,
+ "generated_comment": gen_comment,
+ }
+
+ if gt_comment["evaluation"] == "VALID":
+ if is_retained:
+ matched_valid_retained.append(match_comments)
+ else:
+ matched_valid_excluded.append(match_comments)
+ else:
+ if is_retained:
+ matched_invalid_retained.append(match_comments)
+ else:
+ matched_invalid_excluded.append(match_comments)
+
+ unmatched_gt_valid = []
+ unmatched_gt_invalid = []
+
+ for i in range(len(ground_truth_comments)):
+ if i in seen_old:
+ continue
+
+ comment = ground_truth_comments[i]
+ evaluation = comment["evaluation"]
+ if evaluation == "VALID":
+ unmatched_gt_valid.append(comment)
+ else:
+ unmatched_gt_invalid.append(comment)
+
+ unmatched_gen_retained = []
+ unmatched_gen_excluded = []
+
+ for i in range(len(generated_comments)):
+ if i in seen_new:
+ continue
+
+ comment = new_comments[i]
+ if i in retained_indices:
+ unmatched_gen_retained.append(comment)
+ else:
+ unmatched_gen_excluded.append(comment)
+
+ return {
+ # Matched counts (derived from lists)
+ "matched_valid_count": len(matched_valid_retained)
+ + len(matched_valid_excluded),
+ "matched_invalid_count": len(matched_invalid_retained)
+ + len(matched_invalid_excluded),
+ # Unmatched counts
+ "unmatched_generated_count": len(unmatched_gen_retained)
+ + len(unmatched_gen_excluded),
+ "unmatched_ground_truth_valid_count": len(unmatched_gt_valid),
+ "unmatched_ground_truth_invalid_count": len(unmatched_gt_invalid),
+ # Unmatched details
+ "unmatched_ground_truth_valid": unmatched_gt_valid,
+ "unmatched_ground_truth_invalid": unmatched_gt_invalid,
+ "unmatched_gen_retained": unmatched_gen_retained,
+ "unmatched_gen_excluded": unmatched_gen_excluded,
+ # Filtering metrics
+ "filtering_retained_count": len(retained_comments),
+ "filtering_excluded_count": len(excluded_comments),
+ "filtering_retention_rate": (
+ len(retained_comments) / len(generated_comments)
+ if generated_comments
+ else 0
+ ),
+ "filtering_retained_comments": retained_comments,
+ "filtering_excluded_comments": excluded_comments,
+ # Filtering x Matching breakdown (lists with details)
+ "matched_valid_retained": matched_valid_retained,
+ "matched_valid_excluded": matched_valid_excluded,
+ "matched_invalid_retained": matched_invalid_retained,
+ "matched_invalid_excluded": matched_invalid_excluded,
+ }
+
+ def summarize(self, score_rows: list[dict]) -> dict:
+ total_matched_valid = sum(r["matched_valid_count"] for r in score_rows)
+ total_matched_invalid = sum(r["matched_invalid_count"] for r in score_rows)
+ total_unmatched_gen = sum(r["unmatched_generated_count"] for r in score_rows)
+ total_unmatched_gt_valid = sum(
+ r["unmatched_ground_truth_valid_count"] for r in score_rows
+ )
+ total_unmatched_gt_invalid = sum(
+ r["unmatched_ground_truth_invalid_count"] for r in score_rows
+ )
+
+ total_gt_valid = total_matched_valid + total_unmatched_gt_valid
+ total_gt_invalid = total_matched_invalid + total_unmatched_gt_invalid
+
+ # Filtering aggregates
+ total_retained = sum(r["filtering_retained_count"] for r in score_rows)
+ total_excluded = sum(r["filtering_excluded_count"] for r in score_rows)
+ total_generated = total_retained + total_excluded
+
+ # Filtering x Matching aggregates (use len() since values are lists)
+ total_matched_valid_retained = sum(
+ len(r["matched_valid_retained"]) for r in score_rows
+ )
+ total_matched_valid_excluded = sum(
+ len(r["matched_valid_excluded"]) for r in score_rows
+ )
+ total_matched_invalid_retained = sum(
+ len(r["matched_invalid_retained"]) for r in score_rows
+ )
+ total_matched_invalid_excluded = sum(
+ len(r["matched_invalid_excluded"]) for r in score_rows
+ )
+
+ return {
+ "total_matched_valid": total_matched_valid,
+ "total_matched_invalid": total_matched_invalid,
+ "total_unmatched_generated": total_unmatched_gen,
+ "recall_valid": (
+ total_matched_valid / total_gt_valid if total_gt_valid > 0 else 0
+ ),
+ "recall_invalid": (
+ total_matched_invalid / total_gt_invalid if total_gt_invalid > 0 else 0
+ ),
+ "missed_valid_rate": (
+ total_unmatched_gt_valid / total_gt_valid if total_gt_valid > 0 else 0
+ ),
+ "missed_invalid_rate": (
+ total_unmatched_gt_invalid / total_gt_invalid
+ if total_gt_invalid > 0
+ else 0
+ ),
+ # Filtering summary metrics
+ "total_filtering_retained": total_retained,
+ "total_filtering_excluded": total_excluded,
+ "overall_retention_rate": (
+ total_retained / total_generated if total_generated > 0 else 0
+ ),
+ # Filtering x Matching summary
+ "total_matched_valid_retained": total_matched_valid_retained,
+ "total_matched_valid_excluded": total_matched_valid_excluded,
+ "total_matched_invalid_retained": total_matched_invalid_retained,
+ "total_matched_invalid_excluded": total_matched_invalid_excluded,
+ # Derived rates
+ "false_exclusion_rate": (
+ total_matched_valid_excluded / total_matched_valid
+ if total_matched_valid > 0
+ else 0
+ ),
+ "true_exclusion_rate": (
+ total_matched_invalid_excluded / total_matched_invalid
+ if total_matched_invalid > 0
+ else 0
+ ),
+ }
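Both scorers are plain `weave.Scorer` subclasses: `score` receives the model output plus dataset columns, and `summarize` aggregates the per-example rows. A toy call of `BasicMetricsScorer` with made-up comment texts and file names, which also works outside a full Weave evaluation since `weave.op` functions remain directly callable:

```python
# Sketch only: toy data shaped like the evaluation dataset rows and model output.
from bugbug.tools.code_review.scorer import BasicMetricsScorer

scorer = BasicMetricsScorer()

row = scorer.score(
    output={
        "comments": [{"comment": "Possible off-by-one in the loop bound.", "file": "foo.cpp"}],
        "error": None,
    },
    ground_truth_comments=[
        {"comment": "Check the loop bound here.", "file_path": "foo.cpp", "evaluation": "VALID"},
        {"comment": "Nice cleanup.", "file_path": "foo.cpp", "evaluation": "INVALID"},
    ],
)

print(scorer.summarize([row])["avg_generated_per_diff"])  # 1.0 for this toy row
```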
diff --git a/bugbug/tools/code_review/utils.py b/bugbug/tools/code_review/utils.py
index ce5c5e85dc..9bc648e440 100644
--- a/bugbug/tools/code_review/utils.py
+++ b/bugbug/tools/code_review/utils.py
@@ -4,7 +4,6 @@
# You can obtain one at http://mozilla.org/MPL/2.0/.
-import json
import re
from itertools import chain
from logging import getLogger
@@ -343,37 +342,24 @@ def get_structured_functions(target, functions_declaration):
return function_declaration_text
-def parse_model_output(output: str) -> list[dict]:
- # TODO: This function should raise an exception when the output is not valid
- # JSON. The caller can then decide how to handle the error.
- # For now, we just log the error and return an empty list.
+def convert_generated_comments_to_inline(
+ comments, patch: PatchSet
+) -> Iterable[InlineComment]:
+ """Convert GeneratedReviewComment objects to InlineComment objects.
- json_opening_tag = output.find("```json")
- if json_opening_tag != -1:
- json_closing_tag = output.rfind("```")
- if json_closing_tag == -1:
- logger.error("No closing ``` found for JSON in model output: %s", output)
- return []
- output = output[json_opening_tag + len("```json") : json_closing_tag]
-
- try:
- comments = json.loads(output)
- except json.JSONDecodeError:
- logger.error("Failed to parse JSON from model output: %s", output)
- return []
-
- return comments
-
-
-def generate_processed_output(output: str, patch: PatchSet) -> Iterable[InlineComment]:
- comments = parse_model_output(output)
+ Args:
+ comments: List of GeneratedReviewComment objects.
+ patch: The PatchSet to validate file paths against.
+
+ Yields:
+ InlineComment objects with proper scope information.
+ """
patched_files_map = {
patched_file.target_file: patched_file for patched_file in patch
}
for comment in comments:
- file_path = comment["file"]
+ file_path = comment.file
if not file_path.startswith("b/") and not file_path.startswith("a/"):
file_path = "b/" + file_path
@@ -385,7 +371,7 @@ def generate_processed_output(output: str, patch: PatchSet) -> Iterable[InlineCo
f"The file `{file_path}` is not part of the patch: {list(patched_files_map)}"
)
- line_number = comment["code_line"]
+ line_number = comment.code_line
if not isinstance(line_number, int):
raise ModelResultError("Line number must be an integer")
@@ -401,8 +387,8 @@ def generate_processed_output(output: str, patch: PatchSet) -> Iterable[InlineCo
end_line=line_number,
hunk_start_line=scope["line_start"],
hunk_end_line=scope["line_end"],
- content=comment["comment"],
+ content=comment.comment,
on_removed_code=not scope["has_added_lines"],
- explanation=comment["explanation"],
- order=comment["order"],
+ explanation=comment.explanation,
+ order=comment.order,
)
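The converter now reads attributes (`comment.file`, `comment.code_line`, ...) instead of dictionary keys. A minimal sketch of calling it directly, using `SimpleNamespace` as a stand-in for `GeneratedReviewComment` (whose real definition lives in `agent.py`) and a tiny single-hunk diff; it assumes the unchanged scope-resolution logic in the middle of the helper behaves as before:

```python
# Sketch only: SimpleNamespace stands in for GeneratedReviewComment.
from types import SimpleNamespace

from unidiff import PatchSet

from bugbug.tools.code_review.utils import convert_generated_comments_to_inline

RAW_DIFF = """\
--- a/foo.py
+++ b/foo.py
@@ -1,2 +1,2 @@
-x = 1
+x = 2
 y = 3
"""

comment = SimpleNamespace(
    file="foo.py",
    code_line=1,
    comment="Is changing this constant intentional?",
    explanation="The value changed without an accompanying test update.",
    order=1,
)

inline_comments = list(
    convert_generated_comments_to_inline([comment], PatchSet(RAW_DIFF))
)
```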
diff --git a/bugbug/tools/comment_matching/agent.py b/bugbug/tools/comment_matching/agent.py
new file mode 100644
index 0000000000..2da194a5b4
--- /dev/null
+++ b/bugbug/tools/comment_matching/agent.py
@@ -0,0 +1,62 @@
+from langchain.agents import create_agent
+from langchain.agents.structured_output import ProviderStrategy
+from langchain.chat_models import BaseChatModel
+from langchain.messages import HumanMessage
+from pydantic import BaseModel, Field
+
+from bugbug.tools.comment_matching.prompts import FIRST_MESSAGE_TEMPLATE, SYSTEM_PROMPT
+
+
+class MatchingComment(BaseModel):
+ id: int = Field(description="Unique identifier for the comment")
+ content: str = Field(description="Content of the code review comment")
+ file: str = Field(description="File path of the comment")
+
+
+class CommentMatch(BaseModel):
+ old_comment_id: int = Field(description="ID of the comment from the old set")
+ new_comment_id: int = Field(description="ID of the comment from the new set")
+
+
+class AgentResponse(BaseModel):
+ matches: list[CommentMatch] = Field(
+ description="List of matched comment pairs between old and new comments"
+ )
+
+
+class CommentMatchingTool:
+ def __init__(self, llm: BaseChatModel):
+ self.agent = create_agent(
+ llm,
+ system_prompt=SYSTEM_PROMPT,
+ response_format=ProviderStrategy(AgentResponse),
+ )
+
+ @staticmethod
+ def create(**kwargs):
+ if "llm" not in kwargs:
+ from bugbug.tools.core.llms import create_openai_llm
+
+ kwargs["llm"] = create_openai_llm()
+
+ return CommentMatchingTool(**kwargs)
+
+ def run(
+ self, old_comments: list[MatchingComment], new_comments: list[MatchingComment]
+ ) -> list[CommentMatch]:
+ if not old_comments or not new_comments:
+ return []
+
+ response = self.agent.invoke(
+ {
+ "messages": [
+ HumanMessage(
+ FIRST_MESSAGE_TEMPLATE.format(
+ old_comments=old_comments,
+ new_comments=new_comments,
+ )
+ )
+ ]
+ }
+ )
+
+ return response["structured_response"].matches
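`CommentMatchingTool` takes two lists of comment records and returns index pairs for comments that address the same issue (the scorer passes plain dicts with `id`, `content`, and `file`, which are simply formatted into the prompt). A hedged sketch, assuming OpenAI credentials for the default model from `create()`; the comment texts and file path are made up:

```python
# Sketch only: assumes OpenAI credentials are configured for create_openai_llm().
from bugbug.tools.comment_matching.agent import CommentMatchingTool

matcher = CommentMatchingTool.create()

old_comments = [
    {"id": 0, "content": "The lock is never released on the early-return path.", "file": "netwerk/cache.cpp"},
]
new_comments = [
    {"id": 0, "content": "Consider releasing the lock before returning early here.", "file": "netwerk/cache.cpp"},
]

for match in matcher.run(old_comments=old_comments, new_comments=new_comments):
    print(match.old_comment_id, "<->", match.new_comment_id)
```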
diff --git a/bugbug/tools/comment_matching/prompts.py b/bugbug/tools/comment_matching/prompts.py
new file mode 100644
index 0000000000..8be398afc1
--- /dev/null
+++ b/bugbug/tools/comment_matching/prompts.py
@@ -0,0 +1,29 @@
+SYSTEM_PROMPT = """You are an expert in code review at Mozilla Firefox.
+
+**Task**:
+
+Match two sets of code review comments to identify redundant comments.
+
+**Instructions**:
+
+ 1. **Consider the following about all comments**:
+ - The comments are related to the same code patch.
+ - The comments may be written in different styles.
+
+ 2. **Understand what each comment is addressing**:
+ - Read the comments in both sets.
+ - Understand the issue that each comment is addressing.
+
+ 3. **Check for matches**:
+ - If you find a comment in the old set that is addressing the same issue as a comment in the new set, link them as redundant.
+ - The comments may not be identical, but they should be addressing the same issue.
+ - The level of detail in the comments may vary.
+"""
+
+FIRST_MESSAGE_TEMPLATE = """**First set of comments (old comments / ground truth)**:
+
+{old_comments}
+
+**Second set of comments (new comments / generated)**:
+
+{new_comments}"""
diff --git a/bugbug/tools/core/platforms/base.py b/bugbug/tools/core/platforms/base.py
index c63102d780..0805e3b25a 100644
--- a/bugbug/tools/core/platforms/base.py
+++ b/bugbug/tools/core/platforms/base.py
@@ -79,7 +79,7 @@ def get_review_request_by_id(self, review_id: int):
raise NotImplementedError
@abstractmethod
- def get_patch_by_id(self, patch_id: str) -> Patch:
+ def get_patch_by_id(self, patch_id: str | int) -> Patch:
raise NotImplementedError
@abstractmethod
diff --git a/bugbug/tools/core/platforms/phabricator.py b/bugbug/tools/core/platforms/phabricator.py
index 8f0defdd57..4122a60060 100644
--- a/bugbug/tools/core/platforms/phabricator.py
+++ b/bugbug/tools/core/platforms/phabricator.py
@@ -497,7 +497,7 @@ def get_review_request_by_id(self, revision_id: int) -> ReviewRequest:
wait=tenacity.wait_exponential(multiplier=2, min=2),
reraise=True,
)
- def get_patch_by_id(self, patch_id: str) -> Patch:
+ def get_patch_by_id(self, patch_id: str | int) -> Patch:
return PhabricatorPatch(patch_id)
def get_all_inline_comments(
diff --git a/bugbug/tools/core/platforms/swarm.py b/bugbug/tools/core/platforms/swarm.py
index 0fbf234d96..13cca92fda 100644
--- a/bugbug/tools/core/platforms/swarm.py
+++ b/bugbug/tools/core/platforms/swarm.py
@@ -81,8 +81,8 @@ def __init__(self):
def get_review_request_by_id(self, revision_id: int) -> ReviewRequest:
return ReviewRequest(revision_id)
- def get_patch_by_id(self, patch_id: str) -> Patch:
- return SwarmPatch(patch_id, self.auth)
+ def get_patch_by_id(self, patch_id: str | int) -> Patch:
+ return SwarmPatch(str(patch_id), self.auth)
def get_all_inline_comments(
self, comment_filter
diff --git a/bugbug/tools/patch_summarization/agent.py b/bugbug/tools/patch_summarization/agent.py
new file mode 100644
index 0000000000..9e09935f25
--- /dev/null
+++ b/bugbug/tools/patch_summarization/agent.py
@@ -0,0 +1,48 @@
+from langchain.agents import create_agent
+from langchain.chat_models import BaseChatModel
+from langchain.messages import HumanMessage
+
+from bugbug.tools.base import GenerativeModelTool
+from bugbug.tools.code_review.utils import format_patch_set
+from bugbug.tools.core.platforms.base import Patch
+from bugbug.tools.patch_summarization.prompts import PROMPT_TEMPLATE_SUMMARIZATION
+
+
+class PatchSummarizationTool(GenerativeModelTool):
+ def __init__(self, llm: BaseChatModel, target_software: str = "Mozilla Firefox"):
+ self.target_software = target_software
+ self.agent = create_agent(llm)
+
+ def run(self, patch: Patch):
+ result = self.agent.invoke(
+ {
+ "messages": [
+ HumanMessage(
+ PROMPT_TEMPLATE_SUMMARIZATION.format(
+ target_software=self.target_software,
+ patch=format_patch_set(patch.patch_set),
+ bug_title=patch.bug_title,
+ patch_title=patch.patch_title,
+ patch_description=patch.patch_description,
+ )
+ )
+ ]
+ }
+ )
+
+ return result["messages"][-1].content
+
+ @staticmethod
+ def create(**kwargs):
+ """Factory method to instantiate the tool with default dependencies.
+
+ This method takes the same parameters as the constructor, but all
+ parameters are optional. If a parameter is not provided, a default
+ component will be created and used.
+ """
+ if "llm" not in kwargs:
+ from bugbug.tools.core.llms import create_anthropic_llm
+
+ kwargs["llm"] = create_anthropic_llm()
+
+ return PatchSummarizationTool(**kwargs)
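The summarizer is used the same way the dataset notebook below uses it: fetch a patch from Phabricator and return a free-text summary. A minimal sketch, assuming Phabricator and Anthropic credentials are configured; the diff ID is hypothetical:

```python
# Sketch only: assumes BUGBUG_PHABRICATOR_* and Anthropic credentials are set.
from bugbug.tools.core.platforms.phabricator import PhabricatorReviewData
from bugbug.tools.patch_summarization.agent import PatchSummarizationTool

review_data = PhabricatorReviewData()
summarizer = PatchSummarizationTool.create()

patch = review_data.get_patch_by_id(123456)  # hypothetical diff ID
print(summarizer.run(patch))
```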
diff --git a/bugbug/tools/patch_summarization/prompts.py b/bugbug/tools/patch_summarization/prompts.py
new file mode 100644
index 0000000000..fa0e167c18
--- /dev/null
+++ b/bugbug/tools/patch_summarization/prompts.py
@@ -0,0 +1,25 @@
+PROMPT_TEMPLATE_SUMMARIZATION = """You are an expert reviewer for {target_software}, with experience in source code reviews.
+
+Please analyze the code provided and report a summarization of the new changes; for that, focus on the added code, represented by lines that start with "+".
+
+The summarization should have two parts:
+ 1. **Intent**: Describe the intent of the changes, what they are trying to achieve, and how they relate to the bug or feature request.
+ 2. **Solution**: Describe the solution implemented in the code changes, focusing on how the changes address the intent.
+
+Do not include any code in the summarization, only a description of the changes.
+
+**Bug title**:
+
+{bug_title}
+
+
+**Commit message**:
+
+{patch_title}
+{patch_description}
+
+
+**Diff**:
+
+{patch}
+"""
diff --git a/bugbug/tools/suggestion_filtering/agent.py b/bugbug/tools/suggestion_filtering/agent.py
new file mode 100644
index 0000000000..55a0623e0c
--- /dev/null
+++ b/bugbug/tools/suggestion_filtering/agent.py
@@ -0,0 +1,156 @@
+# -*- coding: utf-8 -*-
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this file,
+# You can obtain one at http://mozilla.org/MPL/2.0/.
+
+"""Suggestion filtering tool implementation."""
+
+from typing import Iterable, Optional
+
+from langchain.agents import create_agent
+from langchain.agents.structured_output import ProviderStrategy
+from langchain.chat_models import BaseChatModel
+from langchain.messages import HumanMessage
+from pydantic import BaseModel, Field
+
+from bugbug.tools.base import GenerativeModelTool
+from bugbug.tools.code_review.agent import GeneratedReviewComment
+from bugbug.tools.code_review.database import SuggestionsFeedbackDB
+from bugbug.tools.suggestion_filtering.prompts import (
+ DEFAULT_REJECTED_EXAMPLES,
+ PROMPT_TEMPLATE_FILTERING_ANALYSIS,
+)
+
+
+class FilteredComments(BaseModel):
+ """The response from the filtering agent."""
+
+ comment_indices: list[int] = Field(
+ description="A list of indices of the comments that were kept after filtering"
+ )
+
+
+class SuggestionFilteringTool(GenerativeModelTool):
+ """Tool for filtering generated review comments.
+
+ Filters out low-quality suggestions using an LLM agent with
+ optional dynamic rejected examples from a feedback database.
+ """
+
+ def __init__(
+ self,
+ llm: BaseChatModel,
+ target_software: str = "Mozilla Firefox",
+ suggestions_feedback_db: Optional[SuggestionsFeedbackDB] = None,
+ ) -> None:
+ self.target_software = target_software
+ self.suggestions_feedback_db = suggestions_feedback_db
+ self.agent = create_agent(
+ llm,
+ response_format=ProviderStrategy(FilteredComments),
+ )
+
+ def get_indices_of_retained_comments(
+ self, suggestions: list[GeneratedReviewComment]
+ ) -> list[int]:
+ """Get indices of comments to keep after filtering.
+
+ Args:
+ suggestions: List of generated review comments to filter.
+
+ Returns:
+ List of indices of comments to keep.
+ """
+ if not suggestions:
+ return []
+
+ rejected_examples = self._get_rejected_examples(suggestions)
+
+ formatted_comments = "\n".join(
+ f"Index {i}: {comment.model_dump()}"
+ for i, comment in enumerate(suggestions)
+ )
+
+ prompt = PROMPT_TEMPLATE_FILTERING_ANALYSIS.format(
+ target_code_consistency=self.target_software,
+ comments=formatted_comments,
+ rejected_examples=rejected_examples,
+ )
+
+ result = self.agent.invoke({"messages": [HumanMessage(prompt)]})
+
+ return result["structured_response"].comment_indices
+
+ def run(
+ self, suggestions: list[GeneratedReviewComment]
+ ) -> list[GeneratedReviewComment]:
+ """Filter the given suggestions and return filtered comments.
+
+ Args:
+ suggestions: List of generated review comments to filter.
+
+ Returns:
+ List of filtered GeneratedReviewComment objects.
+ """
+ return [
+ suggestions[i] for i in self.get_indices_of_retained_comments(suggestions)
+ ]
+
+ def _get_rejected_examples(self, suggestions: list[GeneratedReviewComment]) -> str:
+ """Get rejected examples for filtering.
+
+ Uses dynamic examples from feedback database if available,
+ otherwise falls back to default static examples.
+ """
+ if not self.suggestions_feedback_db:
+ return DEFAULT_REJECTED_EXAMPLES
+
+ rejected_comments = list(self._get_similar_rejected_comments(suggestions))
+ if not rejected_comments:
+ return DEFAULT_REJECTED_EXAMPLES
+
+ return "\n - ".join(rejected_comments)
+
+ def _get_similar_rejected_comments(
+ self, suggestions: list[GeneratedReviewComment]
+ ) -> Iterable[str]:
+ """Find similar rejected comments from the feedback database."""
+ if not self.suggestions_feedback_db:
+ raise Exception("Suggestions feedback database is not available")
+
+ num_examples_per_suggestion = 10 // len(suggestions) or 1
+ seen_ids: set[int] = set()
+
+ for suggestion in suggestions:
+ similar_rejected_suggestions = (
+ self.suggestions_feedback_db.find_similar_rejected_suggestions(
+ suggestion.comment,
+ limit=num_examples_per_suggestion,
+ excluded_ids=seen_ids,
+ )
+ )
+ for rejected_suggestion in similar_rejected_suggestions:
+ seen_ids.add(rejected_suggestion.id)
+ yield rejected_suggestion.comment
+
+ @staticmethod
+ def create(**kwargs):
+ """Factory method to instantiate the tool with default dependencies.
+
+ This method takes the same parameters as the constructor, but all
+ parameters are optional. If a parameter is not provided, a default
+ component will be created and used.
+ """
+ if "suggestions_feedback_db" not in kwargs:
+ from bugbug.vectordb import QdrantVectorDB
+
+ kwargs["suggestions_feedback_db"] = SuggestionsFeedbackDB(
+ QdrantVectorDB("suggestions_feedback")
+ )
+
+ if "llm" not in kwargs:
+ from bugbug.tools.core.llms import create_anthropic_llm
+
+ kwargs["llm"] = create_anthropic_llm()
+
+ return SuggestionFilteringTool(**kwargs)
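The filtering step is now a standalone tool: `run()` returns the retained `GeneratedReviewComment` objects, and `get_indices_of_retained_comments()` exposes the kept indices (used by the scorer above). A sketch of the two wiring options, assuming Anthropic credentials and, for the default factory, Qdrant credentials:

```python
# Sketch only: `suggestions` would come from CodeReviewTool.generate_review_comments().
from bugbug.tools.core.llms import create_anthropic_llm
from bugbug.tools.suggestion_filtering.agent import SuggestionFilteringTool

# Default wiring: rejected examples are retrieved from the Qdrant feedback DB.
filterer = SuggestionFilteringTool.create()

# Without a feedback DB, the prompt falls back to DEFAULT_REJECTED_EXAMPLES.
static_filterer = SuggestionFilteringTool(
    llm=create_anthropic_llm(),
    suggestions_feedback_db=None,
)
```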
diff --git a/bugbug/tools/suggestion_filtering/prompts.py b/bugbug/tools/suggestion_filtering/prompts.py
new file mode 100644
index 0000000000..8478e3ac17
--- /dev/null
+++ b/bugbug/tools/suggestion_filtering/prompts.py
@@ -0,0 +1,37 @@
+# -*- coding: utf-8 -*-
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this file,
+# You can obtain one at http://mozilla.org/MPL/2.0/.
+
+"""Prompt templates for suggestion filtering."""
+
+PROMPT_TEMPLATE_FILTERING_ANALYSIS = """Filter review comments to keep those that:
+- are consistent with the {target_code_consistency} source code;
+- focus on reporting possible bugs, functional regressions, issues, or similar concerns;
+- report readability or design concerns.
+
+Exclude comments that:
+- only describe the change;
+- restate obvious facts like renamed variables or replaced code;
+- include praising;
+- ask if changes are intentional or ask to ensure things exist.
+
+Comments:
+
+{comments}
+
+
+As examples of comments that are not expected, because they are not related to the current patch, please check the list below:
+
+ - {rejected_examples}
+
+"""
+
+
+DEFAULT_REJECTED_EXAMPLES = """Please note that these are minor improvements and the overall quality of the patch is good. The documentation is being expanded in a clear and structured way, which will likely be beneficial for future development.
+ - Please note that these are just suggestions and the code might work perfectly fine as it is. It's always a good idea to test all changes thoroughly to ensure they work as expected.
+ - Overall, the patch seems to be well implemented with no major concerns. The developers have made a conscious decision to align with Chrome's behavior, and the reasoning is well documented.
+ - There are no complex code changes in this patch, so there's no potential for major readability regressions or bugs introduced by the changes.
+ - The `focus(...)` method is called without checking if the element and its associated parameters exist or not. It would be better to check if the element exists before calling the `focus()` method to avoid potential errors.
+ - It's not clear if the `SearchService.sys.mjs` file exists or not. If it doesn't exist, this could cause an error. Please ensure that the file path is correct.
+ - This is a good addition to the code."""
diff --git a/experiments/review_helper_modify_filtering_step.ipy b/experiments/review_helper_modify_filtering_step.ipy
deleted file mode 100644
index 26d764a93f..0000000000
--- a/experiments/review_helper_modify_filtering_step.ipy
+++ /dev/null
@@ -1,306 +0,0 @@
-# %%
-%pip install -q bugbug==0.0.570
-
-# %%
-from bugbug import utils
-from bugbug.code_search.mozilla import FunctionSearchMozilla
-from bugbug.tools.code_review.utils import generate_processed_output, parse_model_output
-
-
-def get_file(commit_hash, path):
- r = utils.get_session("hgmo").get(
- f"https://hg.mozilla.org/mozilla-unified/raw-file/{commit_hash}/{path}",
- headers={
- "User-Agent": utils.get_user_agent(),
- },
- )
- r.raise_for_status()
- return r.text
-
-
-repo_dir = "../mozilla-unified"
-function_search = FunctionSearchMozilla(repo_dir, get_file, True)
-
-
-# %%
-from dotenv import load_dotenv
-
-from bugbug.generative_model_tool import create_openai_llm
-
-load_dotenv("~/.env")
-
-llm = create_openai_llm()
-
-# %%
-from bugbug.tools import code_review
-from bugbug.vectordb import QdrantVectorDB
-
-review_comments_db = code_review.ReviewCommentsDB(QdrantVectorDB("diff_comments"))
-
-
-suggestions_feedback_db = code_review.SuggestionsFeedbackDB(
- QdrantVectorDB("suggestions_feedback")
-)
-
-review_platform = "phabricator"
-review_data: code_review.ReviewData = code_review.review_data_classes[review_platform]()
-
-
-# %%
-tool = code_review.CodeReviewTool(
- llm=llm,
- # function_search=function_search,
- review_comments_db=review_comments_db,
- suggestions_feedback_db=suggestions_feedback_db,
- verbose=False,
-)
-
-
-# %%
-from langchain.chains import LLMChain
-from langchain.prompts import PromptTemplate
-
-partial_variables = {
- "target_code_consistency": "Mozilla Firefox",
-}
-
-current_filtering_chain = LLMChain(
- prompt=PromptTemplate.from_template(
- code_review.PROMPT_TEMPLATE_FILTERING_ANALYSIS,
- partial_variables=partial_variables,
- ),
- llm=llm,
-)
-
-
-# %%
-from scripts.code_review_tool_evaluator import (
- FeedbackEvaluator,
-)
-
-evaluator = FeedbackEvaluator("../bugbug_evaluation_datasets/evaluated_comments.csv")
-
-
-# %%
-import pandas as pd
-
-
-def get_latest_evaluation_results_file():
- import glob
-
- files = glob.glob("evaluation_results_*#*.csv")
- if not files:
- raise FileNotFoundError(
- "No evaluation results file found. The file name should have `#` followed by a PR number."
- )
-
- return max(files)
-
-
-# SELECTION (Option 1: same as before)
-selected_review_requests = [
- (revision_id, code_review.ReviewRequest(diff_id))
- for revision_id, diff_id in pd.read_csv(get_latest_evaluation_results_file())[
- ["revision_id", "diff_id"]
- ]
- .drop_duplicates()
- .itertuples(name=None, index=False)
-]
-
-# SELECTION (Option 2: random sample)
-# selected_review_requests = [
-# (revision_id, code_review.ReviewRequest(diff_id))
-# for revision_id, diff_id in evaluator.evaluated_comments
-# # .query("evaluation == 'CORRECT'")
-# [["revision_id", "diff_id"]].drop_duplicates().sample(100).itertuples(index=False)
-# ]
-
-
-# %%
-
-
-def common_step():
- for review_request_id, review_request in selected_review_requests:
- print("---------------------------------------------------------")
- print(f"Review Request ID: {review_request_id}")
- print(f"Patch ID: {review_request.patch_id}")
- patch = review_data.get_patch_by_id(review_request.patch_id)
- print("---------------------------------------------------------")
-
- try:
- output = tool._generate_suggestions(patch)
- except code_review.FileNotInPatchError as e:
- print("Error while running the tool:", e)
- continue
- except code_review.LargeDiffError:
- print("Skipping the patch because it is too large.")
- continue
-
- unfiltered_suggestions = parse_model_output(output)
- if not unfiltered_suggestions:
- print("No suggestions were generated")
- continue
-
- rejected_examples = (
- "\n - ".join(tool.get_similar_rejected_comments(unfiltered_suggestions))
- if tool.suggestions_feedback_db
- else code_review.DEFAULT_REJECTED_EXAMPLES
- )
-
- raw_output = current_filtering_chain.invoke(
- {
- "review": output,
- "patch": patch.raw_diff,
- "rejected_examples": rejected_examples,
- },
- return_only_outputs=True,
- )["text"]
-
- try:
- comments = list(
- generate_processed_output(raw_output, patch.patch_set)
- )
- except code_review.FileNotInPatchError as e:
- print("Error while running the tool:", e)
- continue
-
- # print_prettified_comments(comments)
- evaluation = evaluator.evaluate_diff_comments(review_request.patch_id, comments)
- evaluation_for_original = [
- {
- "variant_name": "Current Filtering",
- "revision_id": review_request_id,
- "diff_id": review_request.patch_id,
- **row,
- }
- for row in evaluation
- ]
-
- yield (
- rejected_examples,
- output,
- patch,
- review_request_id,
- review_request,
- evaluation_for_original,
- )
-
-
-common_step_results = list(common_step())
-
-# %%
-import pickle
-
-if "common_step_results" not in locals():
- raise ValueError(
- "common_step_results is not defined. Please run the common_step() function first."
- )
-
-
-with open("common_step_results.pkl", "wb") as f:
- pickle.dump(common_step_results, f)
-
-# %%
-import pickle
-
-with open("common_step_results.pkl", "rb") as f:
- common_step_results = pickle.load(f)
-
-# %%
-from datetime import datetime
-
-NEW__PROMPT_TEMPLATE_FILTERING_ANALYSIS = """Filter review comments to keep those that:
-- are consistent with the {target_code_consistency} source code;
-- focus on reporting possible bugs, functional regressions, issues, or similar concerns;
-- report readability or design concerns.
-
-Exclude comments that:
-- only describe the change;
-- restate obvious facts like renamed variables or replaced code;
-- include praising;
-- ask if changes are intentional or ask to ensure things exist.
-
-Do not report any explanation about your choice. Only return a valid JSON list.
-
-Comments:
-{comments}
-
-As examples of not expected comments, not related to the current patch, please, check some below:
- - {rejected_examples}
-"""
-
-new_filtering_chain = LLMChain(
- prompt=PromptTemplate.from_template(
- NEW__PROMPT_TEMPLATE_FILTERING_ANALYSIS,
- partial_variables=partial_variables,
- ),
- llm=llm,
-)
-
-
-all_variants_evaluation_results = []
-
-for (
- rejected_examples,
- output,
- patch,
- review_request_id,
- review_request,
- evaluation_for_original,
-) in common_step_results:
-
- all_variants_evaluation_results.extend(evaluation_for_original)
-
- raw_output = new_filtering_chain.invoke(
- {
- "comments": output,
- "rejected_examples": rejected_examples,
- },
- return_only_outputs=True,
- )["text"]
-
- comments = list(generate_processed_output(raw_output, patch.patch_set))
-
- # print_prettified_comments(comments)
- evaluation = evaluator.evaluate_diff_comments(review_request.patch_id, comments)
- all_variants_evaluation_results.extend(
- {
- "variant_name": "New Filtering",
- "revision_id": review_request_id,
- "diff_id": review_request.patch_id,
- **row,
- }
- for row in evaluation
- )
-
- evaluation_result_all_columns = [
- "variant_name",
- "revision_id",
- "diff_id",
- "new_comment",
- "old_comments_count",
- "matched",
- "old_comment",
- "evaluation",
- ]
-
-
-evaluation_results_file = (
- f"evaluation_results_{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.csv"
-)
-
-
-df = pd.DataFrame(
- all_variants_evaluation_results, columns=evaluation_result_all_columns
-)
-df.to_csv(
- evaluation_results_file,
- index=False,
- header=True,
- mode="w",
-)
-
-
-# %%
-print("Done!")
-# %%
diff --git a/notebooks/code_review_create_dataset.ipynb b/notebooks/code_review_create_dataset.ipynb
new file mode 100644
index 0000000000..96cdf91e42
--- /dev/null
+++ b/notebooks/code_review_create_dataset.ipynb
@@ -0,0 +1,193 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "af4f2030",
+ "metadata": {},
+ "source": [
+ "# Create Dataset for Code Review Evaluation"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "f0f37fd7",
+ "metadata": {},
+ "source": [
+ "## Setup"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "5f57608e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import weave\n",
+ "\n",
+ "PROJECT_NAME = \"bugbug-code-review-eval\"\n",
+ "DATASET_NAME = \"code_review_eval\"\n",
+ "\n",
+ "_ = weave.init(PROJECT_NAME)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "255ccaac",
+ "metadata": {},
+ "source": [
+ "## Prepare the Data\n",
+ "\n",
+ "### Load the feedback data:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "5bd97ef4",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pandas as pd\n",
+ "\n",
+ "df = pd.read_csv(\n",
+ " \"https://docs.google.com/spreadsheets/d/e/2PACX-1vTQLbFpQmzEv4zxaKJprosIt6RWs4jDAN6IDT6xB7EcelD0ZR8qilt2tpscUSGEwMNWVpHKDlPacA7b/pub?output=csv\"\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "e6f4f74a",
+ "metadata": {},
+ "source": [
+ "### Reuse summaries from the last dataset version:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "eb3fee95",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "summaries = {\n",
+ " row[\"diff_id\"]: row[\"patch_summary\"]\n",
+ " for row in weave.ref(DATASET_NAME).get()\n",
+ " if row.get(\"patch_summary\")\n",
+ "}"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "6e93f09e",
+ "metadata": {},
+ "source": [
+ "## Generate Summaries\n",
+ "\n",
+ "Generate patch summaries for new diffs that do not have summaries yet."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "03accee8",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from bugbug.tools.core.platforms.phabricator import PhabricatorReviewData\n",
+ "from bugbug.tools.patch_summarization.agent import PatchSummarizationTool\n",
+ "\n",
+ "summarizer = PatchSummarizationTool.create()\n",
+ "review_data = PhabricatorReviewData()\n",
+ "\n",
+ "\n",
+ "@weave.op()\n",
+ "def summarize(diff_id):\n",
+ " patch = review_data.get_patch_by_id(diff_id)\n",
+ " summaries[diff_id] = summarizer.run(patch)\n",
+ "\n",
+ "\n",
+ "@weave.op()\n",
+ "def generate_summaries(diff_ids):\n",
+ " with weave.ThreadPoolExecutor() as exc:\n",
+ " exc.map(summarize, diff_ids)\n",
+ "\n",
+ "\n",
+ "diff_ids_with_no_summaries = (\n",
+ " df.query(\"diff_id not in @summaries\")[\"diff_id\"].unique().tolist()\n",
+ ")\n",
+ "\n",
+ "generate_summaries(diff_ids_with_no_summaries)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "3c4f93a4",
+ "metadata": {},
+ "source": [
+ "## Save the Dataset"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "c5041136",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "examples = [\n",
+ " {\n",
+ " \"diff_id\": diff_id,\n",
+ " \"revision_id\": revision_id,\n",
+ " \"patch_summary\": summaries[diff_id],\n",
+ " \"ground_truth_comments\": ground_truth_comments,\n",
+ " }\n",
+ " for (diff_id, revision_id), ground_truth_comments in df.groupby(\n",
+ " [\"diff_id\", \"revision_id\"]\n",
+ " )\n",
+ " .apply(lambda x: x.to_dict(orient=\"records\"))\n",
+ " .to_dict()\n",
+ " .items()\n",
+ " if diff_id in summaries\n",
+ "]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "d132783a",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dataset = weave.Dataset(\n",
+ " name=DATASET_NAME,\n",
+ " description=\"Code review evaluation dataset with ground truth comments and patch summaries.\",\n",
+ " rows=examples,\n",
+ ")\n",
+ "\n",
+ "_ = weave.publish(dataset)"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "bugbug (3.12.7)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.12.7"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/notebooks/code_review_evaluation.ipynb b/notebooks/code_review_evaluation.ipynb
new file mode 100644
index 0000000000..3291d491e8
--- /dev/null
+++ b/notebooks/code_review_evaluation.ipynb
@@ -0,0 +1,234 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Code Review Tool Evaluation\n",
+ "\n",
+ "This notebook runs W&B Weave evaluations for the code review tool."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Setup"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import weave\n",
+ "\n",
+ "PROJECT_NAME = \"bugbug-code-review-eval\"\n",
+ "\n",
+ "_ = weave.init(PROJECT_NAME)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Load Dataset"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dataset = weave.ref(\"code_review_eval_legacy\").get()\n",
+ "\n",
+ "print(f\"Dataset has {len(dataset.rows)} examples\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Configure Model"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from functools import cached_property\n",
+ "\n",
+ "from bugbug.tools.code_review.agent import CodeReviewTool\n",
+ "\n",
+ "\n",
+ "class CodeReviewModel(weave.Model):\n",
+ " \"\"\"Weave Model wrapper for CodeReviewTool.\"\"\"\n",
+ "\n",
+ " @cached_property\n",
+ " def tool(self):\n",
+ " return CodeReviewTool.create()\n",
+ "\n",
+ " @weave.op()\n",
+ " def invoke(self, diff_id: int, patch_summary: str) -> dict:\n",
+ " patch = self.tool.review_data.get_patch_by_id(diff_id)\n",
+ " try:\n",
+ " comments = self.tool.generate_review_comments(patch, patch_summary)\n",
+ " return {\n",
+ " \"comments\": comments,\n",
+ " \"error\": None,\n",
+ " }\n",
+ " except Exception as e:\n",
+ " return {\n",
+ " \"comments\": [],\n",
+ " \"error\": str(e),\n",
+ " }\n",
+ "\n",
+ "\n",
+ "model = CodeReviewModel()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Run Evaluation"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from bugbug.tools.code_review.scorer import BasicMetricsScorer, LLMCommentMatchingScorer\n",
+ "\n",
+ "evaluation = weave.Evaluation(\n",
+ " dataset=dataset,\n",
+ " scorers=[BasicMetricsScorer(), LLMCommentMatchingScorer()],\n",
+ ")\n",
+ "\n",
+ "results = await evaluation.evaluate(model)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Visualizations"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import matplotlib.pyplot as plt\n",
+ "\n",
+ "# Extract key metrics for visualization\n",
+ "metrics = {\n",
+ " \"Recall (Valid)\": results.get(\"LLMCommentMatchingScorer\", {}).get(\n",
+ " \"recall_valid\", 0\n",
+ " ),\n",
+ " \"Recall (Invalid)\": results.get(\"LLMCommentMatchingScorer\", {}).get(\n",
+ " \"recall_invalid\", 0\n",
+ " ),\n",
+ " \"Missed Valid Rate\": results.get(\"LLMCommentMatchingScorer\", {}).get(\n",
+ " \"missed_valid_rate\", 0\n",
+ " ),\n",
+ " \"Error Rate\": results.get(\"BasicMetricsScorer\", {}).get(\"error_rate\", 0),\n",
+ "}\n",
+ "\n",
+ "# Bar chart\n",
+ "fig, ax = plt.subplots(figsize=(10, 6))\n",
+ "bars = ax.bar(\n",
+ " metrics.keys(), metrics.values(), color=[\"green\", \"orange\", \"red\", \"gray\"]\n",
+ ")\n",
+ "ax.set_ylabel(\"Rate\")\n",
+ "ax.set_title(\"Code Review Evaluation Metrics\")\n",
+ "ax.set_ylim(0, 1)\n",
+ "\n",
+ "# Add value labels on bars\n",
+ "for bar, value in zip(bars, metrics.values()):\n",
+ " ax.text(\n",
+ " bar.get_x() + bar.get_width() / 2,\n",
+ " bar.get_height() + 0.02,\n",
+ " f\"{value:.2%}\",\n",
+ " ha=\"center\",\n",
+ " va=\"bottom\",\n",
+ " )\n",
+ "\n",
+ "plt.xticks(rotation=45, ha=\"right\")\n",
+ "plt.tight_layout()\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Comment counts comparison\n",
+ "basic_metrics = results.get(\"BasicMetricsScorer\", {})\n",
+ "\n",
+ "counts = {\n",
+ " \"Generated\": basic_metrics.get(\"total_generated_comments\", 0),\n",
+ " \"Ground Truth (Valid)\": basic_metrics.get(\"total_ground_truth_valid\", 0),\n",
+ " \"Ground Truth (Invalid)\": basic_metrics.get(\"total_ground_truth_invalid\", 0),\n",
+ "}\n",
+ "\n",
+ "fig, ax = plt.subplots(figsize=(8, 5))\n",
+ "bars = ax.bar(counts.keys(), counts.values(), color=[\"blue\", \"green\", \"red\"])\n",
+ "ax.set_ylabel(\"Count\")\n",
+ "ax.set_title(\"Comment Counts\")\n",
+ "\n",
+ "for bar, value in zip(bars, counts.values()):\n",
+ " ax.text(\n",
+ " bar.get_x() + bar.get_width() / 2,\n",
+ " bar.get_height() + 0.5,\n",
+ " str(int(value)),\n",
+ " ha=\"center\",\n",
+ " va=\"bottom\",\n",
+ " )\n",
+ "\n",
+ "plt.tight_layout()\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 7. View in W&B\n",
+ "\n",
+ "Visit [W&B Weave](https://wandb.ai) to see detailed traces, compare evaluations, and explore individual predictions."
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "bugbug (3.12.7)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.12.7"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/requirements.txt b/requirements.txt
index c3d5603aac..9e6870eb74 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -41,5 +41,6 @@ taskcluster==94.1.1
tenacity==9.1.2
tqdm==4.67.1
unidiff==0.7.5
+weave>=0.50.0
xgboost==3.1.2
zstandard==0.25.0
diff --git a/scripts/code_review_tool_evaluator.py b/scripts/code_review_tool_evaluator.py
deleted file mode 100644
index 085035d834..0000000000
--- a/scripts/code_review_tool_evaluator.py
+++ /dev/null
@@ -1,613 +0,0 @@
-# -*- coding: utf-8 -*-
-# This Source Code Form is subject to the terms of the Mozilla Public
-# License, v. 2.0. If a copy of the MPL was not distributed with this file,
-# You can obtain one at http://mozilla.org/MPL/2.0/.
-
-"""This script evaluates different variants of the code review tool.
-
-Before running this script, you may need to set the following environment
-variables:
- - BUGBUG_PHABRICATOR_URL
- - BUGBUG_PHABRICATOR_TOKEN
- - BUGBUG_*_API_KEY (replace * with your LLM provider)
- - BUGBUG_QDRANT_API_KEY
- - BUGBUG_QDRANT_LOCATION
-
-To specify different variants to evaluate, please modify the get_tool_variants
-function.
-"""
-
-import os
-from collections import defaultdict
-from datetime import datetime, timedelta
-
-import numpy as np
-import pandas as pd
-from langchain_classic.prompts import PromptTemplate
-from langchain_core.runnables import RunnableSequence
-from tabulate import tabulate
-
-from bugbug import db, phabricator, utils
-from bugbug.code_search.mozilla import FunctionSearchMozilla
-from bugbug.tools import code_review
-from bugbug.tools.code_review.utils import parse_model_output
-from bugbug.tools.core import llms
-from bugbug.tools.core.exceptions import ModelResultError
-from bugbug.vectordb import QdrantVectorDB
-
-VERBOSE_CODE_REVIEW = False
-
-
-EVALUATION_TEMPLATE = """Your are an expert in code review at Mozilla Firefox.
-
-**Task**:
-
-Match two sets of code review comments to identify redundant comments.
-
-**Instructions**:
-
- 1. **Consider the following about all comments**:
- - The comments are related to the same code patch.
- - The comments may be written in different styles.
-
- 2. **Understand what each comment is addressing**:
- - Read the comments in both sets.
- - Understand the issue that each comment is addressing.
-
- 3. **Check for matches**:
- - If you find a comment in the old set that is addressing the same issue as a comment in the new set, link them as redundant.
- - The comments may not be identical, but they should be addressing the same issue.
- - The level of detail in the comments may vary.
-
- 4. **Output format**:
- - Output a list of matched comments.
- - Include the comment IDs only in the output.
- - Each element in the list should be an object with two keys: `old_comment_id` and `new_comment_id`.
- - No explanation is needed in the output, only the IDs of the matched comments.
- - The output should be a valid json only.
-
-
-**Output example**:
-
- [
- {{"old_comment_id": 1, "new_comment_id": 3}},
- {{"old_comment_id": 4, "new_comment_id": 2}},
- ]
-
-**First set of comments (old comments)**:
-
-{old_comments}
-
-**Second set of comments (new comments)**:
-
-{new_comments}
-"""
-
-
-class FeedbackEvaluator:
- def __init__(self, evaluation_dataset: str):
- self.evaluated_comments = pd.read_csv(evaluation_dataset)
-
- llm = llms.create_openai_llm()
- evaluate_comments_prompt = PromptTemplate.from_template(EVALUATION_TEMPLATE)
- self.evaluation_chain = RunnableSequence(evaluate_comments_prompt, llm)
-
- def evaluate_diff_comments(
- self,
- diff_id: int,
- new_comments: list[code_review.InlineComment],
- ) -> list[dict]:
- diff_evaluated_comments = self.evaluated_comments[
- self.evaluated_comments["diff_id"] == diff_id
- ].reset_index()
- diff_evaluated_comments["evaluation"] = np.where(
- diff_evaluated_comments["evaluation"].isin(["CORRECT", "VALID_REDUNDANT"]),
- "VALID",
- "INVALID",
- )
-
- output = self.evaluation_chain.invoke(
- {
- "old_comments": [
- {
- "id": i,
- "content": raw["comment"],
- "file": raw["file_path"],
- }
- for i, raw in diff_evaluated_comments.iterrows()
- ],
- "new_comments": [
- {
- "id": i,
- "content": comment.content,
- "file": comment.filename,
- }
- for i, comment in enumerate(new_comments)
- ],
- }
- )
-
- matches = parse_model_output(output.content)
-
- results = [
- {
- "new_comment": comment.content,
- "new_comment_order": comment.order,
- "old_comments_count": 0,
- "matched": False,
- }
- for comment in new_comments
- ]
- seen_old_comments = set()
-
- for match in matches:
- old_index = match["old_comment_id"]
- new_index = match["new_comment_id"]
-
- evaluated_comment = diff_evaluated_comments.iloc[old_index]
- new_comment = new_comments[new_index]
-
- if evaluated_comment["file_path"] != new_comment.filename:
- print(
- "File mismatch:",
- evaluated_comment["file_path"],
- new_comment.filename,
- )
- continue
-
- current_result = results[new_index]
-
- current_result["evaluation"] = (
- "MIXED"
- if (
- "evaluation" in current_result
- and current_result["evaluation"] != evaluated_comment["evaluation"]
- )
- else evaluated_comment["evaluation"]
- )
-
- if "old_comment" in current_result:
- current_result["old_comment"] += (
- f"\n\n-------------\n\n{evaluated_comment['comment']}"
- )
- else:
- current_result["old_comment"] = evaluated_comment["comment"]
-
- current_result["old_comments_count"] += 1
- current_result["matched"] = True
-
- seen_old_comments.add(old_index)
-
- for i, raw in diff_evaluated_comments.iterrows():
- if i in seen_old_comments:
- continue
-
- results.append(
- {
- "old_comment": raw["comment"],
- "evaluation": raw["evaluation"],
- "old_comments_count": 1,
- }
- )
-
- self.print_evaluation_matches(results)
-
- return results
-
- @staticmethod
- def print_evaluation_matches(matching_results: list[dict]):
- print(
- tabulate(
- [
- (
- result.get("new_comment", ""),
- result.get("old_comment", ""),
- result.get("evaluation", ""),
- )
- for result in matching_results
- ],
- tablefmt="mixed_grid",
- headers=[
- "New Comment",
- "Old Comment",
- "Evaluation",
- ],
- maxcolwidths=[
- 60,
- 60,
- 20,
- ],
- )
- )
-
-
-def get_tool_variants(
- variants: list[str],
-) -> list[tuple[str, code_review.CodeReviewTool]]:
- """Returns a list of tool variants to evaluate.
-
- Returns:
- List of tuples, where each tuple contains the name of the variant and
- and instance of the code review tool to evaluate.
- """
- # Step 1: we start with instantiating the dependencies based on the selected
- # variants.
- repo_dir = "../mozilla-unified"
-
- def get_file(commit_hash, path):
- r = utils.get_session("hgmo").get(
- f"https://hg.mozilla.org/mozilla-unified/raw-file/{commit_hash}/{path}",
- headers={
- "User-Agent": utils.get_user_agent(),
- },
- )
- r.raise_for_status()
- return r.text
-
- function_search = FunctionSearchMozilla(repo_dir, get_file, True)
-
- review_comments_db = code_review.ReviewCommentsDB(QdrantVectorDB("diff_comments"))
-
- suggestions_feedback_db = code_review.SuggestionsFeedbackDB(
- QdrantVectorDB("suggestions_feedback")
- )
-
- # Step 2: we create the selected tool variants.
-
- tool_variants = []
-
- if "claude" in variants:
- tool_variants.append(
- (
- "Claude",
- code_review.CodeReviewTool.create(
- function_search=function_search,
- review_comments_db=review_comments_db,
- suggestions_feedback_db=suggestions_feedback_db,
- verbose=VERBOSE_CODE_REVIEW,
- ),
- )
- )
-
- if "gpt" in variants:
- llm = llms.create_openai_llm()
- tool_variants.append(
- (
- "GPT",
- code_review.CodeReviewTool.create(
- llm=llm,
- summarization_llm=llm,
- filtering_llm=llm,
- function_search=function_search,
- review_comments_db=review_comments_db,
- suggestions_feedback_db=suggestions_feedback_db,
- verbose=VERBOSE_CODE_REVIEW,
- ),
- )
- )
-
- return tool_variants
-
-
-def get_review_requests_sample(since: timedelta, limit: int):
- assert db.download(phabricator.REVISIONS_DB)
-
- start_date = (datetime.now() - since).timestamp()
- MOZILLA_CENTRAL_PHID = "PHID-REPO-saax4qdxlbbhahhp2kg5"
-
- n = 0
- for review_request in phabricator.get_revisions():
- if (
- review_request["fields"]["repositoryPHID"] != MOZILLA_CENTRAL_PHID
- or review_request["fields"]["dateCreated"] <= start_date
- ):
- continue
-
- if n >= limit >= 0:
- break
-
- yield review_request["id"]
- n += 1
-
-
-def print_prettified_comments(comments: list[code_review.InlineComment]):
- if not comments:
- print("No comments to show.")
- return
-
- print(
- tabulate(
- [
- (
- comment.filename,
- comment.end_line,
- comment.content,
- )
- for comment in comments
- ],
- headers=[
- "File",
- "Line",
- "Comment",
- ],
- maxcolwidths=[
- 30,
- 10,
- 100,
- ],
- ),
- )
-
-
-def get_latest_evaluation_results_file(results_dir: str | None):
- import glob
- import os
-
- files = glob.glob("evaluation_results_*#*.csv", root_dir=results_dir)
- if not files:
- raise FileNotFoundError("No evaluation results file found.")
-
- latests_files = max(files)
- if results_dir:
- return os.path.join(results_dir, latests_files)
-
- return latests_files
-
-
-def get_ongoing_evaluation_results_file(results_dir: str | None):
- import glob
- import os
-
- base_file = get_latest_evaluation_results_file(results_dir)
- files = [
- file
- for file in glob.glob("evaluation_results_*.csv", root_dir=results_dir)
- if "#" not in file and file > base_file
- ]
- if not files:
- raise FileNotFoundError("No ongoing evaluation results file found.")
-
- latests_file = max(files)
- if results_dir:
- return os.path.join(results_dir, latests_file)
-
- return latests_file
-
-
-def main(args):
- review_platform = "phabricator"
- review_data: code_review.ReviewData = code_review.review_data_classes[
- review_platform
- ]()
-
- tool_variants = get_tool_variants(args.variants)
-
- evaluator = FeedbackEvaluator(args.evaluation_dataset)
-
- result_file = os.path.join(
- args.results_dir,
- "code_review_tool_evaluator.csv",
- )
- is_first_result = not os.path.exists(result_file)
-
- if is_first_result:
- evaluation_results_file = os.path.join(
- args.results_dir,
- f"evaluation_results_{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.csv",
- )
- seen_patches = set()
- else:
- evaluation_results_file = get_ongoing_evaluation_results_file(args.results_dir)
- seen_patches = set(pd.read_csv(evaluation_results_file)["diff_id"].to_list())
-
- result_unique_columns = ["Review Request ID", "File", "Line", "Comment Number"]
- result_all_columns = result_unique_columns + [
- f"{title} ({variant_name})"
- for variant_name, _ in tool_variants
- for title in ("Comment", "Evaluation")
- ]
- evaluation_result_all_columns = [
- "variant_name",
- "revision_id",
- "diff_id",
- "new_comment",
- "new_comment_order",
- "old_comments_count",
- "matched",
- "old_comment",
- "evaluation",
- ]
-
- selected_review_requests = []
- if args.diff_ids:
- selected_review_requests = (
- ("n/a", code_review.ReviewRequest(diff_id)) for diff_id in args.diff_ids
- )
- elif args.review_request_ids:
- selected_review_requests = (
- (review_request_id, review_data.get_review_request_by_id(review_request_id))
- for review_request_id in args.review_request_ids
- )
- elif args.evaluation_strategy == "random":
- print("No review request IDs specified. Selecting a random sample.")
- selected_review_requests = (
- (revision_id, code_review.ReviewRequest(diff_id))
- for revision_id, diff_id in evaluator.evaluated_comments.query(
- "evaluation == 'CORRECT'"
- )[["revision_id", "diff_id"]]
- .drop_duplicates()
- .sample(20)
- .itertuples(index=False)
- )
- elif args.evaluation_strategy == "same":
- selected_review_requests = (
- (revision_id, code_review.ReviewRequest(diff_id))
- for revision_id, diff_id in pd.read_csv(
- get_latest_evaluation_results_file(args.results_dir),
- )[["revision_id", "diff_id"]]
- .drop_duplicates()
- .itertuples(name=None, index=False)
- )
- else:
- raise ValueError(
- "Please specify either --diff-id or --revision-id. Alternatively, use --evaluation-strategy."
- )
-
- for review_request_id, review_request in selected_review_requests:
- if review_request_id in [227266, 233414]:
- print(
- f"Skipping Review Request ID {review_request_id} because it is known to cause issues."
- )
- continue
-
- if review_request.patch_id in seen_patches:
- print(
- f"Skipping Review Request ID {review_request_id} (Diff ID {review_request.patch_id}) because it was already evaluated."
- )
- continue
-
- print("---------------------------------------------------------")
- print(f"Review Request ID: {review_request_id}")
- print(f"Patch ID: {review_request.patch_id}")
- patch = review_data.get_patch_by_id(review_request.patch_id)
- print("---------------------------------------------------------")
-
- if len(patch.raw_diff) > 20_000:
- print("Skipping the patch because it is too large.")
- continue
-
- all_variants_results = []
- all_variants_evaluation_results = []
- for variant_name, tool in tool_variants:
- print(f"\n\nVariant: {variant_name}\n")
- try:
- comments = tool.run(patch)
- except code_review.FileNotInPatchError as e:
- print("Error while running the tool:", e)
- continue
- except code_review.LargeDiffError:
- print("Skipping the patch because it is too large.")
- continue
- except ModelResultError as e:
- print("Error while running the tool:", e)
- continue
-
- print_prettified_comments(comments)
- comment_per_line_counter = defaultdict(int)
-
- evaluation = evaluator.evaluate_diff_comments(
- review_request.patch_id, comments
- )
-
- all_variants_evaluation_results.extend(
- {
- "variant_name": variant_name,
- "revision_id": review_request_id,
- "diff_id": review_request.patch_id,
- **row,
- }
- for row in evaluation
- )
-
- for i, comment in enumerate(comments):
- key = (review_request_id, comment.filename, comment.end_line)
- comment_per_line_counter[key] += 1
-
- all_variants_results.append(
- {
- "Review Request ID": review_request_id,
- "File": comment.filename,
- "Line": comment.end_line,
- "Comment Number": comment_per_line_counter[key],
- f"Comment ({variant_name})": comment.content,
- f"Evaluation ({variant_name})": evaluation[i].get("evaluation"),
- }
- )
-
- df = (
- pd.DataFrame(all_variants_results, columns=result_all_columns)
- .groupby(result_unique_columns)
- .first()
- )
- df.to_csv(
- result_file,
- header=is_first_result,
- mode="w" if is_first_result else "a",
- )
-
- df = pd.DataFrame(
- all_variants_evaluation_results, columns=evaluation_result_all_columns
- )
- df.to_csv(
- evaluation_results_file,
- index=False,
- header=is_first_result,
- mode="w" if is_first_result else "a",
- )
-
- if is_first_result:
- is_first_result = False
- print("You can find the results in the file:", result_file)
-
- print("\n\n\n")
-
-
-if __name__ == "__main__":
- import argparse
-
- parser = argparse.ArgumentParser(
- formatter_class=argparse.ArgumentDefaultsHelpFormatter
- )
- parser.add_argument(
- "-r",
- "--revision-id",
- dest="review_request_ids",
- action="append",
- help="if specified, run only the selected Revision ID(s)",
- metavar="REVISION_ID",
- type=int,
- )
- parser.add_argument(
- "-d",
- "--diff-id",
- dest="diff_ids",
- action="append",
- help="if specified, run only the selected Diff ID(s)",
- metavar="DIFF_ID",
- type=int,
- )
- parser.add_argument(
- "--evaluation-data",
- dest="evaluation_dataset",
- action="store",
- help="the path or the URL to a evaluation dataset in CSV format",
- )
-
- parser.add_argument(
- "--results-dir",
- dest="results_dir",
- action="store",
- help="the directory to store the results and read previous results",
- )
-
- parser.add_argument(
- "--evaluation-strategy",
- dest="evaluation_strategy",
- action="store",
- help="the evaluation strategy to use",
- )
- parser.add_argument(
- "--variant",
- dest="variants",
- action="append",
- help="the variants to use, use multiple times for multiple variants",
- choices=["claude", "gpt"],
- required=True,
- )
-
- args = parser.parse_args()
-
- if args.diff_ids and args.review_request_ids:
- parser.error("Please specify either --diff-id or --revision-id, not both.")
-
- main(args)
diff --git a/scripts/code_review_tool_evaluator_report.py b/scripts/code_review_tool_evaluator_report.py
deleted file mode 100644
index 801be5bcb4..0000000000
--- a/scripts/code_review_tool_evaluator_report.py
+++ /dev/null
@@ -1,109 +0,0 @@
-# %%
-
-import pandas as pd
-
-import scripts.code_review_tool_evaluator as evaluator_script
-
-evaluation_results = pd.read_csv(
- # evaluator_script.get_latest_evaluation_results_file("../evaluation_results")
- evaluator_script.get_ongoing_evaluation_results_file("../evaluation_results")
-)
-
-# %%
-
-variant_names = evaluation_results["variant_name"].unique()
-variant_name = variant_names[0]
-
-df = evaluation_results[evaluation_results["variant_name"] == variant_name]
-
-
-# %%
-new_comments_count = df["new_comment"].count()
-new_valid_comments = len(df[~df["new_comment"].isna() & (df["evaluation"] == "VALID")])
-new_invalid_comments = len(
- df[~df["new_comment"].isna() & (df["evaluation"] == "INVALID")]
-)
-new_unevaluated_comments = len(df[~df["new_comment"].isna() & df["evaluation"].isna()])
-
-old_comments_count = df["old_comments_count"].sum()
-old_valid_comments = df[df["evaluation"] == "VALID"]["old_comments_count"].sum()
-old_invalid_comments = df[df["evaluation"] == "INVALID"]["old_comments_count"].sum()
-
-matched_valid_comments = df[
- ~df["new_comment"].isna()
- & ~df["old_comment"].isna()
- & (df["evaluation"] == "VALID")
-]["old_comments_count"].sum()
-matched_invalid_comments = df[
- ~df["new_comment"].isna()
- & ~df["old_comment"].isna()
- & (df["evaluation"] == "INVALID")
-]["old_comments_count"].sum()
-
-print("--------------------")
-print("Variant Name:", variant_name)
-print("--------------------")
-print("New Comments:", new_comments_count)
-print("New Valid Comments:", new_valid_comments)
-print("New Invalid Comments:", new_invalid_comments)
-print("New Unevaluated Comments:", new_unevaluated_comments)
-print("--------------------")
-print("Old Comments:", old_comments_count)
-print("Old Valid Comments:", old_valid_comments)
-print("Old Invalid Comments:", old_invalid_comments)
-print("--------------------")
-print(
- "Recalled comments:",
- (matched_valid_comments + matched_invalid_comments) / old_comments_count * 100,
-)
-print("Recalled valid comments:", matched_valid_comments / old_valid_comments * 100)
-print(
- "Recalled invalid comments:", matched_invalid_comments / old_invalid_comments * 100
-)
-print("--------------------")
-print(
- "Missed valid comments:",
- (old_valid_comments - matched_valid_comments) / old_valid_comments * 100,
-)
-print(
- "Missed invalid comments:",
- (old_invalid_comments - matched_invalid_comments) / old_invalid_comments * 100,
-)
-
-
-# %%
-
-
-df = evaluation_results[
- evaluation_results["evaluation"].isin(["VALID", "INVALID"])
- & ~evaluation_results["new_comment"].isna()
-].sort_values(by=["evaluation", "revision_id", "new_comment"])
-
-
-df["id"] = df["diff_id"].astype(str) + " | " + df["new_comment"].astype(str)
-df_old = df[df["variant_name"] == variant_names[0]]
-df_new = df[df["variant_name"] == variant_names[1]]
-
-in_new_but_not_in_old = df_new[~df_new["id"].isin(df_old["id"])]
-
-print(
- "Examples of comments that were filtered by the old version but were not filtered by the new version:\n"
-)
-print(
- in_new_but_not_in_old[["revision_id", "new_comment", "evaluation"]].to_markdown(
- index=False
- )
-)
-
-
-in_old_but_not_in_new = df_old[~df_old["id"].isin(df_new["id"])]
-print(
- "\n\nExamples of comments that were filtered by the new version but were not filtered by the old version:\n"
-)
-print(
- in_old_but_not_in_new[["revision_id", "new_comment", "evaluation"]].to_markdown(
- index=False
- )
-)
-
-# %%