diff --git a/bugbug/code_search/mozilla.py b/bugbug/code_search/mozilla.py index b80c25db0a..1c83af4578 100644 --- a/bugbug/code_search/mozilla.py +++ b/bugbug/code_search/mozilla.py @@ -10,10 +10,10 @@ class FunctionSearchMozilla(FunctionSearch): - def __init__(self, repo_dir, get_file, fast=False): + def __init__(self, repo_dir, get_file=None, fast=False): super().__init__() self.repo_dir = repo_dir - self.get_file = get_file + self.get_file = get_file or FunctionSearchSearchfoxAPI._get_file self.fast = fast def get_function( diff --git a/bugbug/code_search/searchfox_api.py b/bugbug/code_search/searchfox_api.py index 9aa7b76d5a..56d6579943 100644 --- a/bugbug/code_search/searchfox_api.py +++ b/bugbug/code_search/searchfox_api.py @@ -214,9 +214,18 @@ def search(commit_hash, symbol_name): class FunctionSearchSearchfoxAPI(FunctionSearch): - def __init__(self, get_file): + def __init__(self, get_file=None): super().__init__() - self.get_file = get_file + self.get_file = get_file or self._get_file + + @staticmethod + def _get_file(commit_hash, path): + r = utils.get_session("hgmo").get( + f"https://hg.mozilla.org/mozilla-unified/raw-file/{commit_hash}/{path}", + headers={"User-Agent": utils.get_user_agent()}, + ) + r.raise_for_status() + return r.text def definitions_to_results(self, commit_hash, definitions): result = [] diff --git a/bugbug/tools/base.py b/bugbug/tools/base.py index c36b1a19c5..2680bd5f85 100644 --- a/bugbug/tools/base.py +++ b/bugbug/tools/base.py @@ -8,13 +8,5 @@ class GenerativeModelTool(ABC): - @property - @abstractmethod - def version(self) -> str: ... - @abstractmethod def run(self, *args, **kwargs) -> Any: ... - - @staticmethod - def _print_answer(answer): - print(f"\u001b[33;1m\033[1;3m{answer}\u001b[0m") diff --git a/bugbug/tools/code_review/agent.py b/bugbug/tools/code_review/agent.py index 203d95e2ba..32bd38c971 100644 --- a/bugbug/tools/code_review/agent.py +++ b/bugbug/tools/code_review/agent.py @@ -9,44 +9,39 @@ import os from datetime import datetime from logging import getLogger -from typing import Iterable, Optional +from typing import Optional, Protocol from langchain.agents import create_agent from langchain.agents.structured_output import ProviderStrategy from langchain.chat_models import BaseChatModel from langchain.messages import HumanMessage -from langchain_classic.chains import LLMChain -from langchain_classic.prompts import PromptTemplate from langgraph.errors import GraphRecursionError from pydantic import BaseModel, Field from unidiff import PatchSet from bugbug.code_search.function_search import FunctionSearch from bugbug.tools.base import GenerativeModelTool -from bugbug.tools.code_review.database import ReviewCommentsDB, SuggestionsFeedbackDB +from bugbug.tools.code_review.database import ReviewCommentsDB from bugbug.tools.code_review.langchain_tools import ( CodeReviewContext, create_find_function_definition_tool, expand_context, ) from bugbug.tools.code_review.prompts import ( - DEFAULT_REJECTED_EXAMPLES, FIRST_MESSAGE_TEMPLATE, - PROMPT_TEMPLATE_FILTERING_ANALYSIS, - PROMPT_TEMPLATE_SUMMARIZATION, STATIC_COMMENT_EXAMPLES, SYSTEM_PROMPT_TEMPLATE, TEMPLATE_COMMENT_EXAMPLE, TEMPLATE_PATCH_FROM_HUNK, ) from bugbug.tools.code_review.utils import ( + convert_generated_comments_to_inline, format_patch_set, - generate_processed_output, ) from bugbug.tools.core.data_types import InlineComment from bugbug.tools.core.exceptions import LargeDiffError, ModelResultError from bugbug.tools.core.llms import get_tokenizer -from 
bugbug.tools.core.platforms.base import Patch +from bugbug.tools.core.platforms.base import Patch, ReviewData logger = getLogger(__name__) @@ -73,25 +68,35 @@ class AgentResponse(BaseModel): ) -class CodeReviewTool(GenerativeModelTool): - version = "0.0.1" +class PatchSummarizer(Protocol): + def run(self, patch: Patch) -> str: ... + + +class SuggestionFilterer(Protocol): + def run( + self, suggestions: list[GeneratedReviewComment] + ) -> list[GeneratedReviewComment]: ... + +class CodeReviewTool(GenerativeModelTool): def __init__( self, llm: BaseChatModel, - summarization_llm: BaseChatModel, - filtering_llm: BaseChatModel, + patch_summarizer: PatchSummarizer, + suggestion_filterer: SuggestionFilterer, + review_data: ReviewData, function_search: Optional[FunctionSearch] = None, review_comments_db: Optional["ReviewCommentsDB"] = None, show_patch_example: bool = False, verbose: bool = True, - suggestions_feedback_db: Optional["SuggestionsFeedbackDB"] = None, target_software: str = "Mozilla Firefox", ) -> None: super().__init__() self.target_software = target_software + self.review_data = review_data + self._tokenizer = get_tokenizer( llm.model_name if hasattr(llm, "model_name") else "" ) @@ -108,24 +113,8 @@ def __init__( "----------------------------------------------------" ) - self.summarization_chain = LLMChain( - prompt=PromptTemplate.from_template( - PROMPT_TEMPLATE_SUMMARIZATION, - partial_variables={ - "experience_scope": f"the {self.target_software} source code" - }, - ), - llm=summarization_llm, - verbose=verbose, - ) - self.filtering_chain = LLMChain( - prompt=PromptTemplate.from_template( - PROMPT_TEMPLATE_FILTERING_ANALYSIS, - partial_variables={"target_code_consistency": self.target_software}, - ), - llm=filtering_llm, - verbose=verbose, - ) + self.patch_summarizer = patch_summarizer + self.suggestion_filterer = suggestion_filterer tools = [expand_context] if function_search: @@ -146,60 +135,77 @@ def __init__( self.verbose = verbose - self.suggestions_feedback_db = suggestions_feedback_db - @staticmethod - def create( - llm=None, summarization_llm=None, filtering_llm=None, **kwargs - ) -> "CodeReviewTool": - from bugbug.tools.core.llms import create_anthropic_llm - - return CodeReviewTool( - llm=llm - or create_anthropic_llm( + def create(**kwargs): + """Factory method to instantiate the tool with default dependencies. + + This method takes the same parameters as the constructor, but all + parameters are optional. If a parameter is not provided, a default + component will be created and used. 
+ """ + if "function_search" not in kwargs: + from bugbug.code_search.searchfox_api import FunctionSearchSearchfoxAPI + + kwargs["function_search"] = FunctionSearchSearchfoxAPI() + + if "review_comments_db" not in kwargs: + from bugbug.tools.code_review.database import ReviewCommentsDB + from bugbug.vectordb import QdrantVectorDB + + kwargs["review_comments_db"] = ReviewCommentsDB( + QdrantVectorDB("diff_comments") + ) + + if "review_data" not in kwargs: + from bugbug.tools.core.platforms.phabricator import PhabricatorReviewData + + kwargs["review_data"] = PhabricatorReviewData() + + if "llm" not in kwargs: + from bugbug.tools.core.llms import create_anthropic_llm + + kwargs["llm"] = create_anthropic_llm( model_name="claude-opus-4-5-20251101", max_tokens=40_000, temperature=None, thinking={"type": "enabled", "budget_tokens": 10_000}, - ), - summarization_llm=summarization_llm or create_anthropic_llm(), - filtering_llm=filtering_llm or create_anthropic_llm(), - **kwargs, - ) + ) - def count_tokens(self, text): - return len(self._tokenizer.encode(text)) + if "patch_summarizer" not in kwargs: + from bugbug.tools.patch_summarization.agent import PatchSummarizationTool - def generate_initial_prompt(self, patch: Patch) -> str: - formatted_patch = format_patch_set(patch.patch_set) + kwargs["patch_summarizer"] = PatchSummarizationTool.create() - output_summarization = self.summarization_chain.invoke( - { - "patch": formatted_patch, - "bug_title": patch.bug_title, - "patch_title": patch.patch_title, - "patch_description": patch.patch_description, - }, - return_only_outputs=True, - )["text"] + if "suggestion_filterer" not in kwargs: + from bugbug.tools.suggestion_filtering.agent import SuggestionFilteringTool - if self.verbose: - GenerativeModelTool._print_answer(output_summarization) + kwargs["suggestion_filterer"] = SuggestionFilteringTool.create() + return CodeReviewTool(**kwargs) + + def count_tokens(self, text): + return len(self._tokenizer.encode(text)) + + def generate_initial_prompt(self, patch: Patch, patch_summary: str) -> str: created_before = patch.date_created if self.is_experiment_env else None + return FIRST_MESSAGE_TEMPLATE.format( - patch=formatted_patch, - patch_summarization=output_summarization, + patch=format_patch_set(patch.patch_set), + patch_summarization=patch_summary, comment_examples=self._get_comment_examples(patch, created_before), approved_examples=self._get_generated_examples(patch, created_before), ) - def _generate_suggestions(self, patch: Patch) -> list[GeneratedReviewComment]: + def generate_review_comments( + self, patch: Patch, patch_summary: str + ) -> list[GeneratedReviewComment]: try: for chunk in self.agent.stream( { "messages": [ - HumanMessage(self.generate_initial_prompt(patch)), + HumanMessage( + self.generate_initial_prompt(patch, patch_summary) + ), ] }, context=CodeReviewContext(patch=patch), @@ -212,35 +218,26 @@ def _generate_suggestions(self, patch: Patch) -> list[GeneratedReviewComment]: return result["structured_response"].comments + def run_by_diff_id(self, diff_id: str) -> list[InlineComment] | None: + patch = self.review_data.get_patch_by_id(diff_id) + return self.run(patch) + def run(self, patch: Patch) -> list[InlineComment] | None: if self.count_tokens(patch.raw_diff) > 21000: raise LargeDiffError("The diff is too large") - unfiltered_suggestions = self._generate_suggestions(patch) + patch_summary = self.patch_summarizer.run(patch) + + unfiltered_suggestions = self.generate_review_comments(patch, patch_summary) if not unfiltered_suggestions: 
logger.info("No suggestions were generated") return [] - rejected_examples = ( - "\n - ".join(self.get_similar_rejected_comments(unfiltered_suggestions)) - if self.suggestions_feedback_db - else DEFAULT_REJECTED_EXAMPLES - ) - - raw_output = self.filtering_chain.invoke( - { - "comments": str( - [comment.model_dump() for comment in unfiltered_suggestions] - ), - "rejected_examples": rejected_examples, - }, - return_only_outputs=True, - )["text"] - - if self.verbose: - GenerativeModelTool._print_answer(raw_output) + filtered_suggestions = self.suggestion_filterer.run(unfiltered_suggestions) - return list(generate_processed_output(raw_output, patch.patch_set)) + return list( + convert_generated_comments_to_inline(filtered_suggestions, patch.patch_set) + ) def _get_generated_examples(self, patch, created_before: datetime | None = None): """Get examples of comments that were generated by an LLM. @@ -320,24 +317,3 @@ def generate_formatted_patch_from_raw_hunk(raw_hunk, filename): ) for num, example in enumerate(comment_examples) ) - - def get_similar_rejected_comments( - self, suggestions: list[GeneratedReviewComment] - ) -> Iterable[str]: - if not self.suggestions_feedback_db: - raise Exception("Suggestions feedback database is not available") - - num_examples_per_suggestion = 10 // len(suggestions) or 1 - seen_ids: set[int] = set() - - for suggestion in suggestions: - similar_rejected_suggestions = ( - self.suggestions_feedback_db.find_similar_rejected_suggestions( - suggestion.comment, - limit=num_examples_per_suggestion, - excluded_ids=seen_ids, - ) - ) - for rejected_suggestion in similar_rejected_suggestions: - seen_ids.add(rejected_suggestion.id) - yield rejected_suggestion.comment diff --git a/bugbug/tools/code_review/prompts.py b/bugbug/tools/code_review/prompts.py index 9b253c026b..7bffd5c442 100644 --- a/bugbug/tools/code_review/prompts.py +++ b/bugbug/tools/code_review/prompts.py @@ -7,31 +7,6 @@ TARGET_SOFTWARE: str | None = None -PROMPT_TEMPLATE_SUMMARIZATION = """You are an expert reviewer for {experience_scope}, with experience on source code reviews. - -Please, analyze the code provided and report a summarization about the new changes; for that, focus on the coded added represented by lines that start with "+". - -The summarization should have two parts: - 1. **Intent**: Describe the intent of the changes, what they are trying to achieve, and how they relate to the bug or feature request. - 2. **Solution**: Describe the solution implemented in the code changes, focusing on how the changes address the intent. - -Do not include any code in the summarization, only a description of the changes. - -**Bug title**: - -{bug_title} - - -**Commit message**: - -{patch_title} -{patch_description} - - -**Diff**: - -{patch} -""" SYSTEM_PROMPT_TEMPLATE = """You are an expert {target_software} engineer tasked with analyzing a pull request and providing high-quality review comments. You will examine a code patch and generate constructive feedback focusing on potential issues in the changed code. @@ -113,36 +88,6 @@ {comments}""" -PROMPT_TEMPLATE_FILTERING_ANALYSIS = """Filter review comments to keep those that: -- are consistent with the {target_code_consistency} source code; -- focus on reporting possible bugs, functional regressions, issues, or similar concerns; -- report readability or design concerns. 
- -Exclude comments that: -- only describe the change; -- restate obvious facts like renamed variables or replaced code; -- include praising; -- ask if changes are intentional or ask to ensure things exist. - -Only return a valid JSON list. Do not drop any key from the JSON objects. - -Comments: -{comments} - -As examples of not expected comments, not related to the current patch, please, check some below: - - {rejected_examples} -""" - - -DEFAULT_REJECTED_EXAMPLES = """Please note that these are minor improvements and the overall quality of the patch is good. The documentation is being expanded in a clear and structured way, which will likely be beneficial for future development. - - Please note that these are just suggestions and the code might work perfectly fine as it is. It's always a good idea to test all changes thoroughly to ensure they work as expected. - - Overall, the patch seems to be well implemented with no major concerns. The developers have made a conscious decision to align with Chrome's behavior, and the reasoning is well documented. - - There are no complex code changes in this patch, so there's no potential for major readability regressions or bugs introduced by the changes. - - The `focus(...)` method is called without checking if the element and its associated parameters exist or not. It would be better to check if the element exists before calling the `focus()` method to avoid potential errors. - - It's not clear if the `SearchService.sys.mjs` file exists or not. If it doesn't exist, this could cause an error. Please ensure that the file path is correct. - - This is a good addition to the code.""" - - STATIC_COMMENT_EXAMPLES = [ { "comment": { diff --git a/bugbug/tools/code_review/scorer.py b/bugbug/tools/code_review/scorer.py new file mode 100644 index 0000000000..fac5aab475 --- /dev/null +++ b/bugbug/tools/code_review/scorer.py @@ -0,0 +1,285 @@ +from functools import cached_property +from logging import getLogger + +import weave + +from bugbug.tools.comment_matching.agent import CommentMatchingTool +from bugbug.tools.suggestion_filtering.agent import SuggestionFilteringTool + +logger = getLogger(__name__) + + +class BasicMetricsScorer(weave.Scorer): + """Score basic metrics: comment counts and error tracking.""" + + @weave.op() + def score( + self, + output: dict, + ground_truth_comments: list[dict], + ) -> dict: + valid_comment_count = sum( + comment["evaluation"] == "VALID" for comment in ground_truth_comments + ) + invalid_comment_count = sum( + comment["evaluation"] == "INVALID" for comment in ground_truth_comments + ) + + return { + "generated_comment_count": len(output["comments"]), + "ground_truth_valid_count": valid_comment_count, + "ground_truth_invalid_count": invalid_comment_count, + "ground_truth_total_count": len(ground_truth_comments), + "successful": not output["error"], + } + + def summarize(self, score_rows: list[dict]) -> dict: + """Aggregate scores across all examples.""" + total_generated = sum(r["generated_comment_count"] for r in score_rows) + total_gt_valid = sum(r["ground_truth_valid_count"] for r in score_rows) + total_gt_invalid = sum(r["ground_truth_invalid_count"] for r in score_rows) + total_gt = sum(r["ground_truth_total_count"] for r in score_rows) + error_count = sum(not r["successful"] for r in score_rows) + + return { + "total_generated_comments": total_generated, + "total_ground_truth_valid": total_gt_valid, + "total_ground_truth_invalid": total_gt_invalid, + "total_ground_truth": total_gt, + "avg_generated_per_diff": ( + 
total_generated / len(score_rows) if score_rows else 0 + ), + "error_rate": error_count / len(score_rows) if score_rows else 0, + "num_examples": len(score_rows), + } + + +class LLMCommentMatchingScorer(weave.Scorer): + """Score comment matching using LLM-based semantic comparison. + + This scorer uses an LLM to match generated comments against ground truth + comments, calculating recall and precision metrics. + """ + + @cached_property + def matching_tool(self): + return CommentMatchingTool.create() + + @cached_property + def filtering_tool(self): + return SuggestionFilteringTool.create() + + @weave.op() + def score( + self, + output: dict, + ground_truth_comments: list[dict], + diff_id: int, + ) -> dict: + generated_comments = output["comments"] + + retained_indices = set( + self.filtering_tool.get_indices_of_retained_comments(generated_comments) + ) + retained_comments = [ + c for i, c in enumerate(generated_comments) if i in retained_indices + ] + excluded_comments = [ + c for i, c in enumerate(generated_comments) if i not in retained_indices + ] + + old_comments = [ + {"id": i, "content": c["comment"], "file": c["file_path"]} + for i, c in enumerate(ground_truth_comments) + ] + + new_comments = [ + {"id": i, "content": c.comment, "file": c.file} + for i, c in enumerate(generated_comments) + ] + + matches = self.matching_tool.run( + old_comments=old_comments, new_comments=new_comments + ) + + seen_old: set[int] = set() + seen_new: set[int] = set() + matched_valid_retained = [] + matched_valid_excluded = [] + matched_invalid_retained = [] + matched_invalid_excluded = [] + + for match in matches: + old_idx = match.old_comment_id + new_idx = match.new_comment_id + + if old_idx >= len(ground_truth_comments) or new_idx >= len( + generated_comments + ): + continue + + # Validate file match + gt_comment = ground_truth_comments[old_idx] + gen_comment = generated_comments[new_idx] + + if gt_comment["file_path"] != gen_comment.file: + logger.debug( + f"File mismatch for diff {diff_id}: " + f"{gt_comment['file_path']} != {gen_comment.file}" + ) + continue + + seen_old.add(old_idx) + seen_new.add(new_idx) + + is_retained = new_idx in retained_indices + match_comments = { + "ground_truth_comment": gt_comment, + "generated_comment": gen_comment, + } + + if gt_comment["evaluation"] == "VALID": + if is_retained: + matched_valid_retained.append(match_comments) + else: + matched_valid_excluded.append(match_comments) + else: + if is_retained: + matched_invalid_retained.append(match_comments) + else: + matched_invalid_excluded.append(match_comments) + + unmatched_gt_valid = [] + unmatched_gt_invalid = [] + + for i in range(len(ground_truth_comments)): + if i in seen_old: + continue + + comment = ground_truth_comments[i] + evaluation = ground_truth_comments[i]["evaluation"] + if evaluation == "VALID": + unmatched_gt_valid.append(comment) + else: + unmatched_gt_invalid.append(comment) + + unmatched_gen_retained = [] + unmatched_gen_excluded = [] + + for i in range(len(generated_comments)): + if i in seen_new: + continue + + comment = new_comments[i] + if i in retained_indices: + unmatched_gen_retained.append(comment) + else: + unmatched_gen_excluded.append(comment) + + return { + # Matched counts (derived from lists) + "matched_valid_count": len(matched_valid_retained) + + len(matched_valid_excluded), + "matched_invalid_count": len(matched_invalid_retained) + + len(matched_invalid_excluded), + # Unmatched counts + "unmatched_generated_count": len(unmatched_gen_retained) + + len(unmatched_gen_excluded), + 
"unmatched_ground_truth_valid_count": len(unmatched_gt_valid), + "unmatched_ground_truth_invalid_count": len(unmatched_gt_invalid), + # Unmatched details + "unmatched_ground_truth_valid": unmatched_gt_valid, + "unmatched_ground_truth_invalid": unmatched_gt_invalid, + "unmatched_gen_retained": unmatched_gen_retained, + "unmatched_gen_excluded": unmatched_gen_excluded, + # Filtering metrics + "filtering_retained_count": len(retained_comments), + "filtering_excluded_count": len(excluded_comments), + "filtering_retention_rate": ( + len(retained_comments) / len(generated_comments) + if generated_comments + else 0 + ), + "filtering_retained_comments": retained_comments, + "filtering_excluded_comments": excluded_comments, + # Filtering x Matching breakdown (lists with details) + "matched_valid_retained": matched_valid_retained, + "matched_valid_excluded": matched_valid_excluded, + "matched_invalid_retained": matched_invalid_retained, + "matched_invalid_excluded": matched_invalid_excluded, + } + + def summarize(self, score_rows: list[dict]) -> dict: + total_matched_valid = sum(r["matched_valid_count"] for r in score_rows) + total_matched_invalid = sum(r["matched_invalid_count"] for r in score_rows) + total_unmatched_gen = sum(r["unmatched_generated_count"] for r in score_rows) + total_unmatched_gt_valid = sum( + r["unmatched_ground_truth_valid_count"] for r in score_rows + ) + total_unmatched_gt_invalid = sum( + r["unmatched_ground_truth_invalid_count"] for r in score_rows + ) + + total_gt_valid = total_matched_valid + total_unmatched_gt_valid + total_gt_invalid = total_matched_invalid + total_unmatched_gt_invalid + + # Filtering aggregates + total_retained = sum(r["filtering_retained_count"] for r in score_rows) + total_excluded = sum(r["filtering_excluded_count"] for r in score_rows) + total_generated = total_retained + total_excluded + + # Filtering x Matching aggregates (use len() since values are lists) + total_matched_valid_retained = sum( + len(r["matched_valid_retained"]) for r in score_rows + ) + total_matched_valid_excluded = sum( + len(r["matched_valid_excluded"]) for r in score_rows + ) + total_matched_invalid_retained = sum( + len(r["matched_invalid_retained"]) for r in score_rows + ) + total_matched_invalid_excluded = sum( + len(r["matched_invalid_excluded"]) for r in score_rows + ) + + return { + "total_matched_valid": total_matched_valid, + "total_matched_invalid": total_matched_invalid, + "total_unmatched_generated": total_unmatched_gen, + "recall_valid": ( + total_matched_valid / total_gt_valid if total_gt_valid > 0 else 0 + ), + "recall_invalid": ( + total_matched_invalid / total_gt_invalid if total_gt_invalid > 0 else 0 + ), + "missed_valid_rate": ( + total_unmatched_gt_valid / total_gt_valid if total_gt_valid > 0 else 0 + ), + "missed_invalid_rate": ( + total_unmatched_gt_invalid / total_gt_invalid + if total_gt_invalid > 0 + else 0 + ), + # Filtering summary metrics + "total_filtering_retained": total_retained, + "total_filtering_excluded": total_excluded, + "overall_retention_rate": ( + total_retained / total_generated if total_generated > 0 else 0 + ), + # Filtering x Matching summary + "total_matched_valid_retained": total_matched_valid_retained, + "total_matched_valid_excluded": total_matched_valid_excluded, + "total_matched_invalid_retained": total_matched_invalid_retained, + "total_matched_invalid_excluded": total_matched_invalid_excluded, + # Derived rates + "false_exclusion_rate": ( + total_matched_valid_excluded / total_matched_valid + if total_matched_valid > 0 + 
else 0 + ), + "true_exclusion_rate": ( + total_matched_invalid_excluded / total_matched_invalid + if total_matched_invalid > 0 + else 0 + ), + } diff --git a/bugbug/tools/code_review/utils.py b/bugbug/tools/code_review/utils.py index ce5c5e85dc..9bc648e440 100644 --- a/bugbug/tools/code_review/utils.py +++ b/bugbug/tools/code_review/utils.py @@ -4,7 +4,6 @@ # You can obtain one at http://mozilla.org/MPL/2.0/. -import json import re from itertools import chain from logging import getLogger @@ -343,37 +342,24 @@ def get_structured_functions(target, functions_declaration): return function_declaration_text -def parse_model_output(output: str) -> list[dict]: - # TODO: This function should raise an exception when the output is not valid - # JSON. The caller can then decide how to handle the error. - # For now, we just log the error and return an empty list. +def convert_generated_comments_to_inline( + comments, patch: PatchSet +) -> Iterable[InlineComment]: + """Convert GeneratedReviewComment objects to InlineComment objects. - json_opening_tag = output.find("```json") - if json_opening_tag != -1: - json_closing_tag = output.rfind("```") - if json_closing_tag == -1: - logger.error("No closing ``` found for JSON in model output: %s", output) - return [] - output = output[json_opening_tag + len("```json") : json_closing_tag] - - try: - comments = json.loads(output) - except json.JSONDecodeError: - logger.error("Failed to parse JSON from model output: %s", output) - return [] - - return comments - - -def generate_processed_output(output: str, patch: PatchSet) -> Iterable[InlineComment]: - comments = parse_model_output(output) + Args: + comments: List of GeneratedReviewComment objects. + patch: The PatchSet to validate file paths against. + Yields: + InlineComment objects with proper scope information. 
+ """ patched_files_map = { patched_file.target_file: patched_file for patched_file in patch } for comment in comments: - file_path = comment["file"] + file_path = comment.file if not file_path.startswith("b/") and not file_path.startswith("a/"): file_path = "b/" + file_path @@ -385,7 +371,7 @@ def generate_processed_output(output: str, patch: PatchSet) -> Iterable[InlineCo f"The file `{file_path}` is not part of the patch: {list(patched_files_map)}" ) - line_number = comment["code_line"] + line_number = comment.code_line if not isinstance(line_number, int): raise ModelResultError("Line number must be an integer") @@ -401,8 +387,8 @@ def generate_processed_output(output: str, patch: PatchSet) -> Iterable[InlineCo end_line=line_number, hunk_start_line=scope["line_start"], hunk_end_line=scope["line_end"], - content=comment["comment"], + content=comment.comment, on_removed_code=not scope["has_added_lines"], - explanation=comment["explanation"], - order=comment["order"], + explanation=comment.explanation, + order=comment.order, ) diff --git a/bugbug/tools/comment_matching/agent.py b/bugbug/tools/comment_matching/agent.py new file mode 100644 index 0000000000..2da194a5b4 --- /dev/null +++ b/bugbug/tools/comment_matching/agent.py @@ -0,0 +1,62 @@ +from langchain.agents import create_agent +from langchain.agents.structured_output import ProviderStrategy +from langchain.chat_models import BaseChatModel +from langchain.messages import HumanMessage +from pydantic import BaseModel, Field + +from bugbug.tools.comment_matching.prompts import FIRST_MESSAGE_TEMPLATE, SYSTEM_PROMPT + + +class MatchingComment(BaseModel): + id: int = Field(description="Unique identifier for the comment") + content: str = Field(description="Content of the code review comment") + file: str = Field(description="File path of the comment") + + +class CommentMatch(BaseModel): + old_comment_id: int = Field(description="ID of the comment from the old set") + new_comment_id: int = Field(description="ID of the comment from the new set") + + +class AgentResponse(BaseModel): + matches: list[CommentMatch] = Field( + description="List of matched comment pairs between old and new comments" + ) + + +class CommentMatchingTool: + def __init__(self, llm: BaseChatModel): + self.agent = create_agent( + llm, + system_prompt=SYSTEM_PROMPT, + response_format=ProviderStrategy(AgentResponse), + ) + + def create(**kwargs): + if "llm" not in kwargs: + from bugbug.tools.core.llms import create_openai_llm + + kwargs["llm"] = create_openai_llm() + + return CommentMatchingTool(**kwargs) + + def run( + self, old_comments: list[MatchingComment], new_comments: list[MatchingComment] + ) -> list[CommentMatch]: + if not old_comments or not new_comments: + return [] + + response = self.agent.invoke( + { + "messages": [ + HumanMessage( + FIRST_MESSAGE_TEMPLATE.format( + old_comments=old_comments, + new_comments=new_comments, + ) + ) + ] + } + ) + + return response["structured_response"].matches diff --git a/bugbug/tools/comment_matching/prompts.py b/bugbug/tools/comment_matching/prompts.py new file mode 100644 index 0000000000..8be398afc1 --- /dev/null +++ b/bugbug/tools/comment_matching/prompts.py @@ -0,0 +1,29 @@ +SYSTEM_PROMPT = """You are an expert in code review at Mozilla Firefox. + +**Task**: + +Match two sets of code review comments to identify redundant comments. + +**Instructions**: + + 1. **Consider the following about all comments**: + - The comments are related to the same code patch. + - The comments may be written in different styles. + + 2. 
**Understand what each comment is addressing**: + - Read the comments in both sets. + - Understand the issue that each comment is addressing. + + 3. **Check for matches**: + - If you find a comment in the old set that is addressing the same issue as a comment in the new set, link them as redundant. + - The comments may not be identical, but they should be addressing the same issue. + - The level of detail in the comments may vary. +""" + +FIRST_MESSAGE_TEMPLATE = """**First set of comments (old comments / ground truth)**: + +{old_comments} + +**Second set of comments (new comments / generated)**: + +{new_comments}""" diff --git a/bugbug/tools/core/platforms/base.py b/bugbug/tools/core/platforms/base.py index c63102d780..0805e3b25a 100644 --- a/bugbug/tools/core/platforms/base.py +++ b/bugbug/tools/core/platforms/base.py @@ -79,7 +79,7 @@ def get_review_request_by_id(self, review_id: int): raise NotImplementedError @abstractmethod - def get_patch_by_id(self, patch_id: str) -> Patch: + def get_patch_by_id(self, patch_id: str | int) -> Patch: raise NotImplementedError @abstractmethod diff --git a/bugbug/tools/core/platforms/phabricator.py b/bugbug/tools/core/platforms/phabricator.py index 8f0defdd57..4122a60060 100644 --- a/bugbug/tools/core/platforms/phabricator.py +++ b/bugbug/tools/core/platforms/phabricator.py @@ -497,7 +497,7 @@ def get_review_request_by_id(self, revision_id: int) -> ReviewRequest: wait=tenacity.wait_exponential(multiplier=2, min=2), reraise=True, ) - def get_patch_by_id(self, patch_id: str) -> Patch: + def get_patch_by_id(self, patch_id: str | int) -> Patch: return PhabricatorPatch(patch_id) def get_all_inline_comments( diff --git a/bugbug/tools/core/platforms/swarm.py b/bugbug/tools/core/platforms/swarm.py index 0fbf234d96..13cca92fda 100644 --- a/bugbug/tools/core/platforms/swarm.py +++ b/bugbug/tools/core/platforms/swarm.py @@ -81,8 +81,8 @@ def __init__(self): def get_review_request_by_id(self, revision_id: int) -> ReviewRequest: return ReviewRequest(revision_id) - def get_patch_by_id(self, patch_id: str) -> Patch: - return SwarmPatch(patch_id, self.auth) + def get_patch_by_id(self, patch_id: str | int) -> Patch: + return SwarmPatch(str(patch_id), self.auth) def get_all_inline_comments( self, comment_filter diff --git a/bugbug/tools/patch_summarization/agent.py b/bugbug/tools/patch_summarization/agent.py new file mode 100644 index 0000000000..9e09935f25 --- /dev/null +++ b/bugbug/tools/patch_summarization/agent.py @@ -0,0 +1,48 @@ +from langchain.agents import create_agent +from langchain.chat_models import BaseChatModel +from langchain.messages import HumanMessage + +from bugbug.tools.base import GenerativeModelTool +from bugbug.tools.code_review.utils import format_patch_set +from bugbug.tools.core.platforms.base import Patch +from bugbug.tools.patch_summarization.prompts import PROMPT_TEMPLATE_SUMMARIZATION + + +class PatchSummarizationTool(GenerativeModelTool): + def __init__(self, llm: BaseChatModel, target_software: str = "Mozilla Firefox"): + self.target_software = target_software + self.agent = create_agent(llm) + + def run(self, patch: Patch): + result = self.agent.invoke( + { + "messages": [ + HumanMessage( + PROMPT_TEMPLATE_SUMMARIZATION.format( + target_software=self.target_software, + patch=format_patch_set(patch.patch_set), + bug_title=patch.bug_title, + patch_title=patch.patch_title, + patch_description=patch.patch_description, + ) + ) + ] + } + ) + + return result["messages"][-1].content + + @staticmethod + def create(**kwargs): + """Factory method 
to instantiate the tool with default dependencies. + + This method takes the same parameters as the constructor, but all + parameters are optional. If a parameter is not provided, a default + component will be created and used. + """ + if "llm" not in kwargs: + from bugbug.tools.core.llms import create_anthropic_llm + + kwargs["llm"] = create_anthropic_llm() + + return PatchSummarizationTool(**kwargs) diff --git a/bugbug/tools/patch_summarization/prompts.py b/bugbug/tools/patch_summarization/prompts.py new file mode 100644 index 0000000000..fa0e167c18 --- /dev/null +++ b/bugbug/tools/patch_summarization/prompts.py @@ -0,0 +1,25 @@ +PROMPT_TEMPLATE_SUMMARIZATION = """You are an expert reviewer for {target_software}, with experience on source code reviews. + +Please, analyze the code provided and report a summarization about the new changes; for that, focus on the coded added represented by lines that start with "+". + +The summarization should have two parts: + 1. **Intent**: Describe the intent of the changes, what they are trying to achieve, and how they relate to the bug or feature request. + 2. **Solution**: Describe the solution implemented in the code changes, focusing on how the changes address the intent. + +Do not include any code in the summarization, only a description of the changes. + +**Bug title**: + +{bug_title} + + +**Commit message**: + +{patch_title} +{patch_description} + + +**Diff**: + +{patch} +""" diff --git a/bugbug/tools/suggestion_filtering/agent.py b/bugbug/tools/suggestion_filtering/agent.py new file mode 100644 index 0000000000..55a0623e0c --- /dev/null +++ b/bugbug/tools/suggestion_filtering/agent.py @@ -0,0 +1,156 @@ +# -*- coding: utf-8 -*- +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this file, +# You can obtain one at http://mozilla.org/MPL/2.0/. + +"""Suggestion filtering tool implementation.""" + +from typing import Iterable, Optional + +from langchain.agents import create_agent +from langchain.agents.structured_output import ProviderStrategy +from langchain.chat_models import BaseChatModel +from langchain.messages import HumanMessage +from pydantic import BaseModel, Field + +from bugbug.tools.base import GenerativeModelTool +from bugbug.tools.code_review.agent import GeneratedReviewComment +from bugbug.tools.code_review.database import SuggestionsFeedbackDB +from bugbug.tools.suggestion_filtering.prompts import ( + DEFAULT_REJECTED_EXAMPLES, + PROMPT_TEMPLATE_FILTERING_ANALYSIS, +) + + +class FilteredComments(BaseModel): + """The response from the filtering agent.""" + + comment_indices: list[int] = Field( + description="A list of indices of the comments that were kept after filtering" + ) + + +class SuggestionFilteringTool(GenerativeModelTool): + """Tool for filtering generated review comments. + + Filters out low-quality suggestions using an LLM agent with + optional dynamic rejected examples from a feedback database. + """ + + def __init__( + self, + llm: BaseChatModel, + target_software: str = "Mozilla Firefox", + suggestions_feedback_db: Optional[SuggestionsFeedbackDB] = None, + ) -> None: + self.target_software = target_software + self.suggestions_feedback_db = suggestions_feedback_db + self.agent = create_agent( + llm, + response_format=ProviderStrategy(FilteredComments), + ) + + def get_indices_of_retained_comments( + self, suggestions: list[GeneratedReviewComment] + ) -> list[int]: + """Get indices of comments to keep after filtering. 
+ + Args: + suggestions: List of generated review comments to filter. + + Returns: + List of indices of comments to keep. + """ + if not suggestions: + return [] + + rejected_examples = self._get_rejected_examples(suggestions) + + formatted_comments = "\n".join( + f"Index {i}: {comment.model_dump()}" + for i, comment in enumerate(suggestions) + ) + + prompt = PROMPT_TEMPLATE_FILTERING_ANALYSIS.format( + target_code_consistency=self.target_software, + comments=formatted_comments, + rejected_examples=rejected_examples, + ) + + result = self.agent.invoke({"messages": [HumanMessage(prompt)]}) + + return result["structured_response"].comment_indices + + def run( + self, suggestions: list[GeneratedReviewComment] + ) -> list[GeneratedReviewComment]: + """Filter the given suggestions and return filtered comments. + + Args: + suggestions: List of generated review comments to filter. + + Returns: + List of filtered GeneratedReviewComment objects. + """ + return [ + suggestions[i] for i in self.get_indices_of_retained_comments(suggestions) + ] + + def _get_rejected_examples(self, suggestions: list[GeneratedReviewComment]) -> str: + """Get rejected examples for filtering. + + Uses dynamic examples from feedback database if available, + otherwise falls back to default static examples. + """ + if not self.suggestions_feedback_db: + return DEFAULT_REJECTED_EXAMPLES + + rejected_comments = list(self._get_similar_rejected_comments(suggestions)) + if not rejected_comments: + return DEFAULT_REJECTED_EXAMPLES + + return "\n - ".join(rejected_comments) + + def _get_similar_rejected_comments( + self, suggestions: list[GeneratedReviewComment] + ) -> Iterable[str]: + """Find similar rejected comments from the feedback database.""" + if not self.suggestions_feedback_db: + raise Exception("Suggestions feedback database is not available") + + num_examples_per_suggestion = 10 // len(suggestions) or 1 + seen_ids: set[int] = set() + + for suggestion in suggestions: + similar_rejected_suggestions = ( + self.suggestions_feedback_db.find_similar_rejected_suggestions( + suggestion.comment, + limit=num_examples_per_suggestion, + excluded_ids=seen_ids, + ) + ) + for rejected_suggestion in similar_rejected_suggestions: + seen_ids.add(rejected_suggestion.id) + yield rejected_suggestion.comment + + @staticmethod + def create(**kwargs): + """Factory method to instantiate the tool with default dependencies. + + This method takes the same parameters as the constructor, but all + parameters are optional. If a parameter is not provided, a default + component will be created and used. + """ + if "suggestions_feedback_db" not in kwargs: + from bugbug.vectordb import QdrantVectorDB + + kwargs["suggestions_feedback_db"] = SuggestionsFeedbackDB( + QdrantVectorDB("suggestions_feedback") + ) + + if "llm" not in kwargs: + from bugbug.tools.core.llms import create_anthropic_llm + + kwargs["llm"] = create_anthropic_llm() + + return SuggestionFilteringTool(**kwargs) diff --git a/bugbug/tools/suggestion_filtering/prompts.py b/bugbug/tools/suggestion_filtering/prompts.py new file mode 100644 index 0000000000..8478e3ac17 --- /dev/null +++ b/bugbug/tools/suggestion_filtering/prompts.py @@ -0,0 +1,37 @@ +# -*- coding: utf-8 -*- +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this file, +# You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +"""Prompt templates for suggestion filtering.""" + +PROMPT_TEMPLATE_FILTERING_ANALYSIS = """Filter review comments to keep those that: +- are consistent with the {target_code_consistency} source code; +- focus on reporting possible bugs, functional regressions, issues, or similar concerns; +- report readability or design concerns. + +Exclude comments that: +- only describe the change; +- restate obvious facts like renamed variables or replaced code; +- include praising; +- ask if changes are intentional or ask to ensure things exist. + +Comments: + +{comments} + + +As examples of not expected comments, not related to the current patch, please, check some below: + + - {rejected_examples} + +""" + + +DEFAULT_REJECTED_EXAMPLES = """Please note that these are minor improvements and the overall quality of the patch is good. The documentation is being expanded in a clear and structured way, which will likely be beneficial for future development. + - Please note that these are just suggestions and the code might work perfectly fine as it is. It's always a good idea to test all changes thoroughly to ensure they work as expected. + - Overall, the patch seems to be well implemented with no major concerns. The developers have made a conscious decision to align with Chrome's behavior, and the reasoning is well documented. + - There are no complex code changes in this patch, so there's no potential for major readability regressions or bugs introduced by the changes. + - The `focus(...)` method is called without checking if the element and its associated parameters exist or not. It would be better to check if the element exists before calling the `focus()` method to avoid potential errors. + - It's not clear if the `SearchService.sys.mjs` file exists or not. If it doesn't exist, this could cause an error. Please ensure that the file path is correct. 
+ - This is a good addition to the code.""" diff --git a/experiments/review_helper_modify_filtering_step.ipy b/experiments/review_helper_modify_filtering_step.ipy deleted file mode 100644 index 26d764a93f..0000000000 --- a/experiments/review_helper_modify_filtering_step.ipy +++ /dev/null @@ -1,306 +0,0 @@ -# %% -%pip install -q bugbug==0.0.570 - -# %% -from bugbug import utils -from bugbug.code_search.mozilla import FunctionSearchMozilla -from bugbug.tools.code_review.utils import generate_processed_output, parse_model_output - - -def get_file(commit_hash, path): - r = utils.get_session("hgmo").get( - f"https://hg.mozilla.org/mozilla-unified/raw-file/{commit_hash}/{path}", - headers={ - "User-Agent": utils.get_user_agent(), - }, - ) - r.raise_for_status() - return r.text - - -repo_dir = "../mozilla-unified" -function_search = FunctionSearchMozilla(repo_dir, get_file, True) - - -# %% -from dotenv import load_dotenv - -from bugbug.generative_model_tool import create_openai_llm - -load_dotenv("~/.env") - -llm = create_openai_llm() - -# %% -from bugbug.tools import code_review -from bugbug.vectordb import QdrantVectorDB - -review_comments_db = code_review.ReviewCommentsDB(QdrantVectorDB("diff_comments")) - - -suggestions_feedback_db = code_review.SuggestionsFeedbackDB( - QdrantVectorDB("suggestions_feedback") -) - -review_platform = "phabricator" -review_data: code_review.ReviewData = code_review.review_data_classes[review_platform]() - - -# %% -tool = code_review.CodeReviewTool( - llm=llm, - # function_search=function_search, - review_comments_db=review_comments_db, - suggestions_feedback_db=suggestions_feedback_db, - verbose=False, -) - - -# %% -from langchain.chains import LLMChain -from langchain.prompts import PromptTemplate - -partial_variables = { - "target_code_consistency": "Mozilla Firefox", -} - -current_filtering_chain = LLMChain( - prompt=PromptTemplate.from_template( - code_review.PROMPT_TEMPLATE_FILTERING_ANALYSIS, - partial_variables=partial_variables, - ), - llm=llm, -) - - -# %% -from scripts.code_review_tool_evaluator import ( - FeedbackEvaluator, -) - -evaluator = FeedbackEvaluator("../bugbug_evaluation_datasets/evaluated_comments.csv") - - -# %% -import pandas as pd - - -def get_latest_evaluation_results_file(): - import glob - - files = glob.glob("evaluation_results_*#*.csv") - if not files: - raise FileNotFoundError( - "No evaluation results file found. The file name should have `#` followed by a PR number." 
- ) - - return max(files) - - -# SELECTION (Option 1: same as before) -selected_review_requests = [ - (revision_id, code_review.ReviewRequest(diff_id)) - for revision_id, diff_id in pd.read_csv(get_latest_evaluation_results_file())[ - ["revision_id", "diff_id"] - ] - .drop_duplicates() - .itertuples(name=None, index=False) -] - -# SELECTION (Option 2: random sample) -# selected_review_requests = [ -# (revision_id, code_review.ReviewRequest(diff_id)) -# for revision_id, diff_id in evaluator.evaluated_comments -# # .query("evaluation == 'CORRECT'") -# [["revision_id", "diff_id"]].drop_duplicates().sample(100).itertuples(index=False) -# ] - - -# %% - - -def common_step(): - for review_request_id, review_request in selected_review_requests: - print("---------------------------------------------------------") - print(f"Review Request ID: {review_request_id}") - print(f"Patch ID: {review_request.patch_id}") - patch = review_data.get_patch_by_id(review_request.patch_id) - print("---------------------------------------------------------") - - try: - output = tool._generate_suggestions(patch) - except code_review.FileNotInPatchError as e: - print("Error while running the tool:", e) - continue - except code_review.LargeDiffError: - print("Skipping the patch because it is too large.") - continue - - unfiltered_suggestions = parse_model_output(output) - if not unfiltered_suggestions: - print("No suggestions were generated") - continue - - rejected_examples = ( - "\n - ".join(tool.get_similar_rejected_comments(unfiltered_suggestions)) - if tool.suggestions_feedback_db - else code_review.DEFAULT_REJECTED_EXAMPLES - ) - - raw_output = current_filtering_chain.invoke( - { - "review": output, - "patch": patch.raw_diff, - "rejected_examples": rejected_examples, - }, - return_only_outputs=True, - )["text"] - - try: - comments = list( - generate_processed_output(raw_output, patch.patch_set) - ) - except code_review.FileNotInPatchError as e: - print("Error while running the tool:", e) - continue - - # print_prettified_comments(comments) - evaluation = evaluator.evaluate_diff_comments(review_request.patch_id, comments) - evaluation_for_original = [ - { - "variant_name": "Current Filtering", - "revision_id": review_request_id, - "diff_id": review_request.patch_id, - **row, - } - for row in evaluation - ] - - yield ( - rejected_examples, - output, - patch, - review_request_id, - review_request, - evaluation_for_original, - ) - - -common_step_results = list(common_step()) - -# %% -import pickle - -if "common_step_results" not in locals(): - raise ValueError( - "common_step_results is not defined. Please run the common_step() function first." - ) - - -with open("common_step_results.pkl", "wb") as f: - pickle.dump(common_step_results, f) - -# %% -import pickle - -with open("common_step_results.pkl", "rb") as f: - common_step_results = pickle.load(f) - -# %% -from datetime import datetime - -NEW__PROMPT_TEMPLATE_FILTERING_ANALYSIS = """Filter review comments to keep those that: -- are consistent with the {target_code_consistency} source code; -- focus on reporting possible bugs, functional regressions, issues, or similar concerns; -- report readability or design concerns. - -Exclude comments that: -- only describe the change; -- restate obvious facts like renamed variables or replaced code; -- include praising; -- ask if changes are intentional or ask to ensure things exist. - -Do not report any explanation about your choice. Only return a valid JSON list. 
- -Comments: -{comments} - -As examples of not expected comments, not related to the current patch, please, check some below: - - {rejected_examples} -""" - -new_filtering_chain = LLMChain( - prompt=PromptTemplate.from_template( - NEW__PROMPT_TEMPLATE_FILTERING_ANALYSIS, - partial_variables=partial_variables, - ), - llm=llm, -) - - -all_variants_evaluation_results = [] - -for ( - rejected_examples, - output, - patch, - review_request_id, - review_request, - evaluation_for_original, -) in common_step_results: - - all_variants_evaluation_results.extend(evaluation_for_original) - - raw_output = new_filtering_chain.invoke( - { - "comments": output, - "rejected_examples": rejected_examples, - }, - return_only_outputs=True, - )["text"] - - comments = list(generate_processed_output(raw_output, patch.patch_set)) - - # print_prettified_comments(comments) - evaluation = evaluator.evaluate_diff_comments(review_request.patch_id, comments) - all_variants_evaluation_results.extend( - { - "variant_name": "New Filtering", - "revision_id": review_request_id, - "diff_id": review_request.patch_id, - **row, - } - for row in evaluation - ) - - evaluation_result_all_columns = [ - "variant_name", - "revision_id", - "diff_id", - "new_comment", - "old_comments_count", - "matched", - "old_comment", - "evaluation", - ] - - -evaluation_results_file = ( - f"evaluation_results_{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.csv" -) - - -df = pd.DataFrame( - all_variants_evaluation_results, columns=evaluation_result_all_columns -) -df.to_csv( - evaluation_results_file, - index=False, - header=True, - mode="w", -) - - -# %% -print("Done!") -# %% diff --git a/notebooks/code_review_create_dataset.ipynb b/notebooks/code_review_create_dataset.ipynb new file mode 100644 index 0000000000..96cdf91e42 --- /dev/null +++ b/notebooks/code_review_create_dataset.ipynb @@ -0,0 +1,193 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "af4f2030", + "metadata": {}, + "source": [ + "# Create Dataset for Code Review Evaluation" + ] + }, + { + "cell_type": "markdown", + "id": "f0f37fd7", + "metadata": {}, + "source": [ + "## Setup" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5f57608e", + "metadata": {}, + "outputs": [], + "source": [ + "import weave\n", + "\n", + "PROJECT_NAME = \"bugbug-code-review-eval\"\n", + "DATASET_NAME = \"code_review_eval\"\n", + "\n", + "_ = weave.init(PROJECT_NAME)" + ] + }, + { + "cell_type": "markdown", + "id": "255ccaac", + "metadata": {}, + "source": [ + "## Prepare the Data\n", + "\n", + "### Load the feedback data:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5bd97ef4", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "df = pd.read_csv(\n", + " \"https://docs.google.com/spreadsheets/d/e/2PACX-1vTQLbFpQmzEv4zxaKJprosIt6RWs4jDAN6IDT6xB7EcelD0ZR8qilt2tpscUSGEwMNWVpHKDlPacA7b/pub?output=csv\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "e6f4f74a", + "metadata": {}, + "source": [ + "### Reuse summaries from the last dataset version:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "eb3fee95", + "metadata": {}, + "outputs": [], + "source": [ + "summaries = {\n", + " row[\"diff_id\"]: row[\"patch_summary\"]\n", + " for row in weave.ref(DATASET_NAME).get()\n", + " if row.get(\"patch_summary\")\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "6e93f09e", + "metadata": {}, + "source": [ + "## Generate Summaries\n", + "\n", + "Generate patch summaries for new diffs that do 
not have summaries yet." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "03accee8", + "metadata": {}, + "outputs": [], + "source": [ + "from bugbug.tools.core.platforms.phabricator import PhabricatorReviewData\n", + "from bugbug.tools.patch_summarization.agent import PatchSummarizationTool\n", + "\n", + "summarizer = PatchSummarizationTool.create()\n", + "review_data = PhabricatorReviewData()\n", + "\n", + "\n", + "@weave.op()\n", + "def summarize(diff_id):\n", + " patch = review_data.get_patch_by_id(diff_id)\n", + " summaries[diff_id] = summarizer.run(patch)\n", + "\n", + "\n", + "@weave.op()\n", + "def generate_summaries(diff_ids):\n", + " with weave.ThreadPoolExecutor() as exc:\n", + " exc.map(summarize, diff_ids)\n", + "\n", + "\n", + "diff_ids_with_no_summaries = (\n", + " df.query(\"diff_id not in @summaries\")[\"diff_id\"].unique().tolist()\n", + ")\n", + "\n", + "generate_summaries(diff_ids_with_no_summaries)" + ] + }, + { + "cell_type": "markdown", + "id": "3c4f93a4", + "metadata": {}, + "source": [ + "## Save the Dataset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c5041136", + "metadata": {}, + "outputs": [], + "source": [ + "examples = [\n", + " {\n", + " \"diff_id\": diff_id,\n", + " \"revision_id\": revision_id,\n", + " \"patch_summary\": summaries[diff_id],\n", + " \"ground_truth_comments\": ground_truth_comments,\n", + " }\n", + " for (diff_id, revision_id), ground_truth_comments in df.groupby(\n", + " [\"diff_id\", \"revision_id\"]\n", + " )\n", + " .apply(lambda x: x.to_dict(orient=\"records\"))\n", + " .to_dict()\n", + " .items()\n", + " if diff_id in summaries\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d132783a", + "metadata": {}, + "outputs": [], + "source": [ + "dataset = weave.Dataset(\n", + " name=DATASET_NAME,\n", + " description=\"Code review evaluation dataset with ground truth comments and patch summaries.\",\n", + " rows=examples,\n", + ")\n", + "\n", + "_ = weave.publish(dataset)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "bugbug (3.12.7)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/notebooks/code_review_evaluation.ipynb b/notebooks/code_review_evaluation.ipynb new file mode 100644 index 0000000000..3291d491e8 --- /dev/null +++ b/notebooks/code_review_evaluation.ipynb @@ -0,0 +1,234 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Code Review Tool Evaluation\n", + "\n", + "This notebook runs W&B Weave evaluations for the code review tool." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Setup" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import weave\n", + "\n", + "PROJECT_NAME = \"bugbug-code-review-eval\"\n", + "\n", + "_ = weave.init(PROJECT_NAME)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load Dataset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dataset = weave.ref(\"code_review_eval_legacy\").get()\n", + "\n", + "print(f\"Dataset has {len(dataset.rows)} examples\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Configure Model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from functools import cached_property\n", + "\n", + "from bugbug.tools.code_review.agent import CodeReviewTool\n", + "\n", + "\n", + "class CodeReviewModel(weave.Model):\n", + " \"\"\"Weave Model wrapper for CodeReviewTool.\"\"\"\n", + "\n", + " @cached_property\n", + " def tool(self):\n", + " return CodeReviewTool.create()\n", + "\n", + " @weave.op()\n", + " def invoke(self, diff_id: int, patch_summary: str) -> dict:\n", + " patch = self.tool.review_data.get_patch_by_id(diff_id)\n", + " try:\n", + " comments = self.tool.generate_review_comments(patch, patch_summary)\n", + " return {\n", + " \"comments\": comments,\n", + " \"error\": None,\n", + " }\n", + " except Exception as e:\n", + " return {\n", + " \"comments\": [],\n", + " \"error\": str(e),\n", + " }\n", + "\n", + "\n", + "model = CodeReviewModel()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Run Evaluation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from bugbug.tools.code_review.scorer import BasicMetricsScorer, LLMCommentMatchingScorer\n", + "\n", + "evaluation = weave.Evaluation(\n", + " dataset=dataset,\n", + " scorers=[BasicMetricsScorer(), LLMCommentMatchingScorer()],\n", + ")\n", + "\n", + "results = await evaluation.evaluate(model)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Visualizations" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "\n", + "# Extract key metrics for visualization\n", + "metrics = {\n", + " \"Recall (Valid)\": results.get(\"LLMCommentMatchingScorer\", {}).get(\n", + " \"recall_valid\", 0\n", + " ),\n", + " \"Recall (Invalid)\": results.get(\"LLMCommentMatchingScorer\", {}).get(\n", + " \"recall_invalid\", 0\n", + " ),\n", + " \"Missed Valid Rate\": results.get(\"LLMCommentMatchingScorer\", {}).get(\n", + " \"missed_valid_rate\", 0\n", + " ),\n", + " \"Error Rate\": results.get(\"BasicMetricsScorer\", {}).get(\"error_rate\", 0),\n", + "}\n", + "\n", + "# Bar chart\n", + "fig, ax = plt.subplots(figsize=(10, 6))\n", + "bars = ax.bar(\n", + " metrics.keys(), metrics.values(), color=[\"green\", \"orange\", \"red\", \"gray\"]\n", + ")\n", + "ax.set_ylabel(\"Rate\")\n", + "ax.set_title(\"Code Review Evaluation Metrics\")\n", + "ax.set_ylim(0, 1)\n", + "\n", + "# Add value labels on bars\n", + "for bar, value in zip(bars, metrics.values()):\n", + " ax.text(\n", + " bar.get_x() + bar.get_width() / 2,\n", + " bar.get_height() + 0.02,\n", + " f\"{value:.2%}\",\n", + " ha=\"center\",\n", + " va=\"bottom\",\n", + " )\n", + "\n", + 
"plt.xticks(rotation=45, ha=\"right\")\n", + "plt.tight_layout()\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Comment counts comparison\n", + "basic_metrics = results.get(\"BasicMetricsScorer\", {})\n", + "\n", + "counts = {\n", + " \"Generated\": basic_metrics.get(\"total_generated_comments\", 0),\n", + " \"Ground Truth (Valid)\": basic_metrics.get(\"total_ground_truth_valid\", 0),\n", + " \"Ground Truth (Invalid)\": basic_metrics.get(\"total_ground_truth_invalid\", 0),\n", + "}\n", + "\n", + "fig, ax = plt.subplots(figsize=(8, 5))\n", + "bars = ax.bar(counts.keys(), counts.values(), color=[\"blue\", \"green\", \"red\"])\n", + "ax.set_ylabel(\"Count\")\n", + "ax.set_title(\"Comment Counts\")\n", + "\n", + "for bar, value in zip(bars, counts.values()):\n", + " ax.text(\n", + " bar.get_x() + bar.get_width() / 2,\n", + " bar.get_height() + 0.5,\n", + " str(int(value)),\n", + " ha=\"center\",\n", + " va=\"bottom\",\n", + " )\n", + "\n", + "plt.tight_layout()\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 7. View in W&B\n", + "\n", + "Visit [W&B Weave](https://wandb.ai) to see detailed traces, compare evaluations, and explore individual predictions." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "bugbug (3.12.7)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.7" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/requirements.txt b/requirements.txt index c3d5603aac..9e6870eb74 100644 --- a/requirements.txt +++ b/requirements.txt @@ -41,5 +41,6 @@ taskcluster==94.1.1 tenacity==9.1.2 tqdm==4.67.1 unidiff==0.7.5 +weave>=0.50.0 xgboost==3.1.2 zstandard==0.25.0 diff --git a/scripts/code_review_tool_evaluator.py b/scripts/code_review_tool_evaluator.py deleted file mode 100644 index 085035d834..0000000000 --- a/scripts/code_review_tool_evaluator.py +++ /dev/null @@ -1,613 +0,0 @@ -# -*- coding: utf-8 -*- -# This Source Code Form is subject to the terms of the Mozilla Public -# License, v. 2.0. If a copy of the MPL was not distributed with this file, -# You can obtain one at http://mozilla.org/MPL/2.0/. - -"""This script evaluates different variants of the code review tool. - -Before running this script, you may need to set the following environment -variables: - - BUGBUG_PHABRICATOR_URL - - BUGBUG_PHABRICATOR_TOKEN - - BUGBUG_*_API_KEY (replace * with your LLM provider) - - BUGBUG_QDRANT_API_KEY - - BUGBUG_QDRANT_LOCATION - -To specify different variants to evaluate, please modify the get_tool_variants -function. 
-""" - -import os -from collections import defaultdict -from datetime import datetime, timedelta - -import numpy as np -import pandas as pd -from langchain_classic.prompts import PromptTemplate -from langchain_core.runnables import RunnableSequence -from tabulate import tabulate - -from bugbug import db, phabricator, utils -from bugbug.code_search.mozilla import FunctionSearchMozilla -from bugbug.tools import code_review -from bugbug.tools.code_review.utils import parse_model_output -from bugbug.tools.core import llms -from bugbug.tools.core.exceptions import ModelResultError -from bugbug.vectordb import QdrantVectorDB - -VERBOSE_CODE_REVIEW = False - - -EVALUATION_TEMPLATE = """Your are an expert in code review at Mozilla Firefox. - -**Task**: - -Match two sets of code review comments to identify redundant comments. - -**Instructions**: - - 1. **Consider the following about all comments**: - - The comments are related to the same code patch. - - The comments may be written in different styles. - - 2. **Understand what each comment is addressing**: - - Read the comments in both sets. - - Understand the issue that each comment is addressing. - - 3. **Check for matches**: - - If you find a comment in the old set that is addressing the same issue as a comment in the new set, link them as redundant. - - The comments may not be identical, but they should be addressing the same issue. - - The level of detail in the comments may vary. - - 4. **Output format**: - - Output a list of matched comments. - - Include the comment IDs only in the output. - - Each element in the list should be an object with two keys: `old_comment_id` and `new_comment_id`. - - No explanation is needed in the output, only the IDs of the matched comments. - - The output should be a valid json only. 
- - -**Output example**: - - [ - {{"old_comment_id": 1, "new_comment_id": 3}}, - {{"old_comment_id": 4, "new_comment_id": 2}}, - ] - -**First set of comments (old comments)**: - -{old_comments} - -**Second set of comments (new comments)**: - -{new_comments} -""" - - -class FeedbackEvaluator: - def __init__(self, evaluation_dataset: str): - self.evaluated_comments = pd.read_csv(evaluation_dataset) - - llm = llms.create_openai_llm() - evaluate_comments_prompt = PromptTemplate.from_template(EVALUATION_TEMPLATE) - self.evaluation_chain = RunnableSequence(evaluate_comments_prompt, llm) - - def evaluate_diff_comments( - self, - diff_id: int, - new_comments: list[code_review.InlineComment], - ) -> list[dict]: - diff_evaluated_comments = self.evaluated_comments[ - self.evaluated_comments["diff_id"] == diff_id - ].reset_index() - diff_evaluated_comments["evaluation"] = np.where( - diff_evaluated_comments["evaluation"].isin(["CORRECT", "VALID_REDUNDANT"]), - "VALID", - "INVALID", - ) - - output = self.evaluation_chain.invoke( - { - "old_comments": [ - { - "id": i, - "content": raw["comment"], - "file": raw["file_path"], - } - for i, raw in diff_evaluated_comments.iterrows() - ], - "new_comments": [ - { - "id": i, - "content": comment.content, - "file": comment.filename, - } - for i, comment in enumerate(new_comments) - ], - } - ) - - matches = parse_model_output(output.content) - - results = [ - { - "new_comment": comment.content, - "new_comment_order": comment.order, - "old_comments_count": 0, - "matched": False, - } - for comment in new_comments - ] - seen_old_comments = set() - - for match in matches: - old_index = match["old_comment_id"] - new_index = match["new_comment_id"] - - evaluated_comment = diff_evaluated_comments.iloc[old_index] - new_comment = new_comments[new_index] - - if evaluated_comment["file_path"] != new_comment.filename: - print( - "File mismatch:", - evaluated_comment["file_path"], - new_comment.filename, - ) - continue - - current_result = results[new_index] - - current_result["evaluation"] = ( - "MIXED" - if ( - "evaluation" in current_result - and current_result["evaluation"] != evaluated_comment["evaluation"] - ) - else evaluated_comment["evaluation"] - ) - - if "old_comment" in current_result: - current_result["old_comment"] += ( - f"\n\n-------------\n\n{evaluated_comment['comment']}" - ) - else: - current_result["old_comment"] = evaluated_comment["comment"] - - current_result["old_comments_count"] += 1 - current_result["matched"] = True - - seen_old_comments.add(old_index) - - for i, raw in diff_evaluated_comments.iterrows(): - if i in seen_old_comments: - continue - - results.append( - { - "old_comment": raw["comment"], - "evaluation": raw["evaluation"], - "old_comments_count": 1, - } - ) - - self.print_evaluation_matches(results) - - return results - - @staticmethod - def print_evaluation_matches(matching_results: list[dict]): - print( - tabulate( - [ - ( - result.get("new_comment", ""), - result.get("old_comment", ""), - result.get("evaluation", ""), - ) - for result in matching_results - ], - tablefmt="mixed_grid", - headers=[ - "New Comment", - "Old Comment", - "Evaluation", - ], - maxcolwidths=[ - 60, - 60, - 20, - ], - ) - ) - - -def get_tool_variants( - variants: list[str], -) -> list[tuple[str, code_review.CodeReviewTool]]: - """Returns a list of tool variants to evaluate. - - Returns: - List of tuples, where each tuple contains the name of the variant and - and instance of the code review tool to evaluate. 
- """ - # Step 1: we start with instantiating the dependencies based on the selected - # variants. - repo_dir = "../mozilla-unified" - - def get_file(commit_hash, path): - r = utils.get_session("hgmo").get( - f"https://hg.mozilla.org/mozilla-unified/raw-file/{commit_hash}/{path}", - headers={ - "User-Agent": utils.get_user_agent(), - }, - ) - r.raise_for_status() - return r.text - - function_search = FunctionSearchMozilla(repo_dir, get_file, True) - - review_comments_db = code_review.ReviewCommentsDB(QdrantVectorDB("diff_comments")) - - suggestions_feedback_db = code_review.SuggestionsFeedbackDB( - QdrantVectorDB("suggestions_feedback") - ) - - # Step 2: we create the selected tool variants. - - tool_variants = [] - - if "claude" in variants: - tool_variants.append( - ( - "Claude", - code_review.CodeReviewTool.create( - function_search=function_search, - review_comments_db=review_comments_db, - suggestions_feedback_db=suggestions_feedback_db, - verbose=VERBOSE_CODE_REVIEW, - ), - ) - ) - - if "gpt" in variants: - llm = llms.create_openai_llm() - tool_variants.append( - ( - "GPT", - code_review.CodeReviewTool.create( - llm=llm, - summarization_llm=llm, - filtering_llm=llm, - function_search=function_search, - review_comments_db=review_comments_db, - suggestions_feedback_db=suggestions_feedback_db, - verbose=VERBOSE_CODE_REVIEW, - ), - ) - ) - - return tool_variants - - -def get_review_requests_sample(since: timedelta, limit: int): - assert db.download(phabricator.REVISIONS_DB) - - start_date = (datetime.now() - since).timestamp() - MOZILLA_CENTRAL_PHID = "PHID-REPO-saax4qdxlbbhahhp2kg5" - - n = 0 - for review_request in phabricator.get_revisions(): - if ( - review_request["fields"]["repositoryPHID"] != MOZILLA_CENTRAL_PHID - or review_request["fields"]["dateCreated"] <= start_date - ): - continue - - if n >= limit >= 0: - break - - yield review_request["id"] - n += 1 - - -def print_prettified_comments(comments: list[code_review.InlineComment]): - if not comments: - print("No comments to show.") - return - - print( - tabulate( - [ - ( - comment.filename, - comment.end_line, - comment.content, - ) - for comment in comments - ], - headers=[ - "File", - "Line", - "Comment", - ], - maxcolwidths=[ - 30, - 10, - 100, - ], - ), - ) - - -def get_latest_evaluation_results_file(results_dir: str | None): - import glob - import os - - files = glob.glob("evaluation_results_*#*.csv", root_dir=results_dir) - if not files: - raise FileNotFoundError("No evaluation results file found.") - - latests_files = max(files) - if results_dir: - return os.path.join(results_dir, latests_files) - - return latests_files - - -def get_ongoing_evaluation_results_file(results_dir: str | None): - import glob - import os - - base_file = get_latest_evaluation_results_file(results_dir) - files = [ - file - for file in glob.glob("evaluation_results_*.csv", root_dir=results_dir) - if "#" not in file and file > base_file - ] - if not files: - raise FileNotFoundError("No ongoing evaluation results file found.") - - latests_file = max(files) - if results_dir: - return os.path.join(results_dir, latests_file) - - return latests_file - - -def main(args): - review_platform = "phabricator" - review_data: code_review.ReviewData = code_review.review_data_classes[ - review_platform - ]() - - tool_variants = get_tool_variants(args.variants) - - evaluator = FeedbackEvaluator(args.evaluation_dataset) - - result_file = os.path.join( - args.results_dir, - "code_review_tool_evaluator.csv", - ) - is_first_result = not os.path.exists(result_file) - 
- if is_first_result: - evaluation_results_file = os.path.join( - args.results_dir, - f"evaluation_results_{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.csv", - ) - seen_patches = set() - else: - evaluation_results_file = get_ongoing_evaluation_results_file(args.results_dir) - seen_patches = set(pd.read_csv(evaluation_results_file)["diff_id"].to_list()) - - result_unique_columns = ["Review Request ID", "File", "Line", "Comment Number"] - result_all_columns = result_unique_columns + [ - f"{title} ({variant_name})" - for variant_name, _ in tool_variants - for title in ("Comment", "Evaluation") - ] - evaluation_result_all_columns = [ - "variant_name", - "revision_id", - "diff_id", - "new_comment", - "new_comment_order", - "old_comments_count", - "matched", - "old_comment", - "evaluation", - ] - - selected_review_requests = [] - if args.diff_ids: - selected_review_requests = ( - ("n/a", code_review.ReviewRequest(diff_id)) for diff_id in args.diff_ids - ) - elif args.review_request_ids: - selected_review_requests = ( - (review_request_id, review_data.get_review_request_by_id(review_request_id)) - for review_request_id in args.review_request_ids - ) - elif args.evaluation_strategy == "random": - print("No review request IDs specified. Selecting a random sample.") - selected_review_requests = ( - (revision_id, code_review.ReviewRequest(diff_id)) - for revision_id, diff_id in evaluator.evaluated_comments.query( - "evaluation == 'CORRECT'" - )[["revision_id", "diff_id"]] - .drop_duplicates() - .sample(20) - .itertuples(index=False) - ) - elif args.evaluation_strategy == "same": - selected_review_requests = ( - (revision_id, code_review.ReviewRequest(diff_id)) - for revision_id, diff_id in pd.read_csv( - get_latest_evaluation_results_file(args.results_dir), - )[["revision_id", "diff_id"]] - .drop_duplicates() - .itertuples(name=None, index=False) - ) - else: - raise ValueError( - "Please specify either --diff-id or --revision-id. Alternatively, use --evaluation-strategy." - ) - - for review_request_id, review_request in selected_review_requests: - if review_request_id in [227266, 233414]: - print( - f"Skipping Review Request ID {review_request_id} because it is known to cause issues." - ) - continue - - if review_request.patch_id in seen_patches: - print( - f"Skipping Review Request ID {review_request_id} (Diff ID {review_request.patch_id}) because it was already evaluated." 
- ) - continue - - print("---------------------------------------------------------") - print(f"Review Request ID: {review_request_id}") - print(f"Patch ID: {review_request.patch_id}") - patch = review_data.get_patch_by_id(review_request.patch_id) - print("---------------------------------------------------------") - - if len(patch.raw_diff) > 20_000: - print("Skipping the patch because it is too large.") - continue - - all_variants_results = [] - all_variants_evaluation_results = [] - for variant_name, tool in tool_variants: - print(f"\n\nVariant: {variant_name}\n") - try: - comments = tool.run(patch) - except code_review.FileNotInPatchError as e: - print("Error while running the tool:", e) - continue - except code_review.LargeDiffError: - print("Skipping the patch because it is too large.") - continue - except ModelResultError as e: - print("Error while running the tool:", e) - continue - - print_prettified_comments(comments) - comment_per_line_counter = defaultdict(int) - - evaluation = evaluator.evaluate_diff_comments( - review_request.patch_id, comments - ) - - all_variants_evaluation_results.extend( - { - "variant_name": variant_name, - "revision_id": review_request_id, - "diff_id": review_request.patch_id, - **row, - } - for row in evaluation - ) - - for i, comment in enumerate(comments): - key = (review_request_id, comment.filename, comment.end_line) - comment_per_line_counter[key] += 1 - - all_variants_results.append( - { - "Review Request ID": review_request_id, - "File": comment.filename, - "Line": comment.end_line, - "Comment Number": comment_per_line_counter[key], - f"Comment ({variant_name})": comment.content, - f"Evaluation ({variant_name})": evaluation[i].get("evaluation"), - } - ) - - df = ( - pd.DataFrame(all_variants_results, columns=result_all_columns) - .groupby(result_unique_columns) - .first() - ) - df.to_csv( - result_file, - header=is_first_result, - mode="w" if is_first_result else "a", - ) - - df = pd.DataFrame( - all_variants_evaluation_results, columns=evaluation_result_all_columns - ) - df.to_csv( - evaluation_results_file, - index=False, - header=is_first_result, - mode="w" if is_first_result else "a", - ) - - if is_first_result: - is_first_result = False - print("You can find the results in the file:", result_file) - - print("\n\n\n") - - -if __name__ == "__main__": - import argparse - - parser = argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter - ) - parser.add_argument( - "-r", - "--revision-id", - dest="review_request_ids", - action="append", - help="if specified, run only the selected Revision ID(s)", - metavar="REVISION_ID", - type=int, - ) - parser.add_argument( - "-d", - "--diff-id", - dest="diff_ids", - action="append", - help="if specified, run only the selected Diff ID(s)", - metavar="DIFF_ID", - type=int, - ) - parser.add_argument( - "--evaluation-data", - dest="evaluation_dataset", - action="store", - help="the path or the URL to a evaluation dataset in CSV format", - ) - - parser.add_argument( - "--results-dir", - dest="results_dir", - action="store", - help="the directory to store the results and read previous results", - ) - - parser.add_argument( - "--evaluation-strategy", - dest="evaluation_strategy", - action="store", - help="the evaluation strategy to use", - ) - parser.add_argument( - "--variant", - dest="variants", - action="append", - help="the variants to use, use multiple times for multiple variants", - choices=["claude", "gpt"], - required=True, - ) - - args = parser.parse_args() - - if args.diff_ids 
and args.review_request_ids: - parser.error("Please specify either --diff-id or --revision-id, not both.") - - main(args) diff --git a/scripts/code_review_tool_evaluator_report.py b/scripts/code_review_tool_evaluator_report.py deleted file mode 100644 index 801be5bcb4..0000000000 --- a/scripts/code_review_tool_evaluator_report.py +++ /dev/null @@ -1,109 +0,0 @@ -# %% - -import pandas as pd - -import scripts.code_review_tool_evaluator as evaluator_script - -evaluation_results = pd.read_csv( - # evaluator_script.get_latest_evaluation_results_file("../evaluation_results") - evaluator_script.get_ongoing_evaluation_results_file("../evaluation_results") -) - -# %% - -variant_names = evaluation_results["variant_name"].unique() -variant_name = variant_names[0] - -df = evaluation_results[evaluation_results["variant_name"] == variant_name] - - -# %% -new_comments_count = df["new_comment"].count() -new_valid_comments = len(df[~df["new_comment"].isna() & (df["evaluation"] == "VALID")]) -new_invalid_comments = len( - df[~df["new_comment"].isna() & (df["evaluation"] == "INVALID")] -) -new_unevaluated_comments = len(df[~df["new_comment"].isna() & df["evaluation"].isna()]) - -old_comments_count = df["old_comments_count"].sum() -old_valid_comments = df[df["evaluation"] == "VALID"]["old_comments_count"].sum() -old_invalid_comments = df[df["evaluation"] == "INVALID"]["old_comments_count"].sum() - -matched_valid_comments = df[ - ~df["new_comment"].isna() - & ~df["old_comment"].isna() - & (df["evaluation"] == "VALID") -]["old_comments_count"].sum() -matched_invalid_comments = df[ - ~df["new_comment"].isna() - & ~df["old_comment"].isna() - & (df["evaluation"] == "INVALID") -]["old_comments_count"].sum() - -print("--------------------") -print("Variant Name:", variant_name) -print("--------------------") -print("New Comments:", new_comments_count) -print("New Valid Comments:", new_valid_comments) -print("New Invalid Comments:", new_invalid_comments) -print("New Unevaluated Comments:", new_unevaluated_comments) -print("--------------------") -print("Old Comments:", old_comments_count) -print("Old Valid Comments:", old_valid_comments) -print("Old Invalid Comments:", old_invalid_comments) -print("--------------------") -print( - "Recalled comments:", - (matched_valid_comments + matched_invalid_comments) / old_comments_count * 100, -) -print("Recalled valid comments:", matched_valid_comments / old_valid_comments * 100) -print( - "Recalled invalid comments:", matched_invalid_comments / old_invalid_comments * 100 -) -print("--------------------") -print( - "Missed valid comments:", - (old_valid_comments - matched_valid_comments) / old_valid_comments * 100, -) -print( - "Missed invalid comments:", - (old_invalid_comments - matched_invalid_comments) / old_invalid_comments * 100, -) - - -# %% - - -df = evaluation_results[ - evaluation_results["evaluation"].isin(["VALID", "INVALID"]) - & ~evaluation_results["new_comment"].isna() -].sort_values(by=["evaluation", "revision_id", "new_comment"]) - - -df["id"] = df["diff_id"].astype(str) + " | " + df["new_comment"].astype(str) -df_old = df[df["variant_name"] == variant_names[0]] -df_new = df[df["variant_name"] == variant_names[1]] - -in_new_but_not_in_old = df_new[~df_new["id"].isin(df_old["id"])] - -print( - "Examples of comments that were filtered by the old version but were not filtered by the new version:\n" -) -print( - in_new_but_not_in_old[["revision_id", "new_comment", "evaluation"]].to_markdown( - index=False - ) -) - - -in_old_but_not_in_new = 
df_old[~df_old["id"].isin(df_new["id"])] -print( - "\n\nExamples of comments that were filtered by the new version but were not filtered by the old version:\n" -) -print( - in_old_but_not_in_new[["revision_id", "new_comment", "evaluation"]].to_markdown( - index=False - ) -) - -# %%