3232from bugbug .tools import code_review
3333from bugbug .tools .code_review .utils import parse_model_output
3434from bugbug .tools .core import llms
35+ from bugbug .tools .core .exceptions import ModelResultError
3536from bugbug .vectordb import QdrantVectorDB
3637
3738code_review .TARGET_SOFTWARE = "Mozilla Firefox"
@@ -221,7 +222,9 @@ def print_evaluation_matches(matching_results: list[dict]):
221222 )
222223
223224
224- def get_tool_variants () -> list [tuple [str , code_review .CodeReviewTool ]]:
225+ def get_tool_variants (
226+ variants : list [str ],
227+ ) -> list [tuple [str , code_review .CodeReviewTool ]]:
225228 """Returns a list of tool variants to evaluate.
226229
227230 Returns:
@@ -254,31 +257,33 @@ def get_file(commit_hash, path):
254257
255258 tool_variants = []
256259
257- tool_variants .append (
258- (
259- "Claude" ,
260- code_review .CodeReviewTool (
261- llm = llms .create_anthropic_llm (),
262- function_search = function_search ,
263- review_comments_db = review_comments_db ,
264- suggestions_feedback_db = suggestions_feedback_db ,
265- verbose = VERBOSE_CODE_REVIEW ,
266- ),
260+ if "claude" in variants :
261+ tool_variants .append (
262+ (
263+ "Claude" ,
264+ code_review .CodeReviewTool (
265+ llm = llms .create_anthropic_llm (),
266+ function_search = function_search ,
267+ review_comments_db = review_comments_db ,
268+ suggestions_feedback_db = suggestions_feedback_db ,
269+ verbose = VERBOSE_CODE_REVIEW ,
270+ ),
271+ )
267272 )
268- )
269273
270- tool_variants .append (
271- (
272- "GPT" ,
273- code_review .CodeReviewTool (
274- llm = llms .create_openai_llm (),
275- function_search = function_search ,
276- review_comments_db = review_comments_db ,
277- suggestions_feedback_db = suggestions_feedback_db ,
278- verbose = VERBOSE_CODE_REVIEW ,
279- ),
274+ if "gpt" in variants :
275+ tool_variants .append (
276+ (
277+ "GPT" ,
278+ code_review .CodeReviewTool (
279+ llm = llms .create_openai_llm (),
280+ function_search = function_search ,
281+ review_comments_db = review_comments_db ,
282+ suggestions_feedback_db = suggestions_feedback_db ,
283+ verbose = VERBOSE_CODE_REVIEW ,
284+ ),
285+ )
280286 )
281- )
282287
283288 return tool_variants
284289
@@ -348,25 +353,52 @@ def get_latest_evaluation_results_file(results_dir: str | None):
348353 return latests_files
349354
350355
def get_ongoing_evaluation_results_file(results_dir: str | None):
    """Locate the evaluation results file of an in-progress (resumed) run.

    Scans ``results_dir`` (or the current directory when ``None``) for
    ``evaluation_results_*.csv`` files that sort after the baseline file
    reported by ``get_latest_evaluation_results_file`` and picks the newest
    one. File names embed a ``%Y-%m-%d_%H-%M-%S`` timestamp, so lexicographic
    ordering matches chronological ordering.

    Args:
        results_dir: directory containing the results files, or ``None``
            to look in the current working directory.

    Returns:
        The path to the newest ongoing results file (joined with
        ``results_dir`` when one was given).

    Raises:
        FileNotFoundError: if no ongoing results file exists.
    """
    # Imported locally, mirroring the surrounding file's style for this helper.
    import glob
    import os

    base_file = get_latest_evaluation_results_file(results_dir)

    # Keep only files newer than the baseline; names containing "#" are
    # excluded (presumably editor backup/lock copies — TODO confirm).
    # NOTE(review): `name > base_file` assumes the baseline is a bare file
    # name; if get_latest_evaluation_results_file returns a path joined with
    # results_dir, this comparison should be verified.
    candidates = [
        name
        for name in glob.glob("evaluation_results_*.csv", root_dir=results_dir)
        if "#" not in name and name > base_file
    ]

    if not candidates:
        raise FileNotFoundError("No ongoing evaluation results file found.")

    latest_file = max(candidates)
    return os.path.join(results_dir, latest_file) if results_dir else latest_file
374+
375+
351376def main (args ):
352377 review_platform = "phabricator"
353378 review_data : code_review .ReviewData = code_review .review_data_classes [
354379 review_platform
355380 ]()
356381
357- tool_variants = get_tool_variants ()
382+ tool_variants = get_tool_variants (args . variants )
358383
359384 evaluator = FeedbackEvaluator (args .evaluation_dataset )
360385
361- is_first_result = True
362386 result_file = os .path .join (
363387 args .results_dir ,
364388 "code_review_tool_evaluator.csv" ,
365389 )
366- evaluation_results_file = os .path .join (
367- args .results_dir ,
368- f"evaluation_results_{ datetime .now ().strftime ('%Y-%m-%d_%H-%M-%S' )} .csv" ,
369- )
390+ is_first_result = not os .path .exists (result_file )
391+
392+ if is_first_result :
393+ evaluation_results_file = os .path .join (
394+ args .results_dir ,
395+ f"evaluation_results_{ datetime .now ().strftime ('%Y-%m-%d_%H-%M-%S' )} .csv" ,
396+ )
397+ seen_patches = set ()
398+ else :
399+ evaluation_results_file = get_ongoing_evaluation_results_file (args .results_dir )
400+ seen_patches = set (pd .read_csv (evaluation_results_file )["diff_id" ].to_list ())
401+
370402 result_unique_columns = ["Review Request ID" , "File" , "Line" , "Comment Number" ]
371403 result_all_columns = result_unique_columns + [
372404 f"{ title } ({ variant_name } )"
@@ -421,6 +453,18 @@ def main(args):
421453 )
422454
423455 for review_request_id , review_request in selected_review_requests :
456+ if review_request_id in [227266 , 233414 ]:
457+ print (
458+ f"Skipping Review Request ID { review_request_id } because it is known to cause issues."
459+ )
460+ continue
461+
462+ if review_request .patch_id in seen_patches :
463+ print (
464+ f"Skipping Review Request ID { review_request_id } (Diff ID { review_request .patch_id } ) because it was already evaluated."
465+ )
466+ continue
467+
424468 print ("---------------------------------------------------------" )
425469 print (f"Review Request ID: { review_request_id } " )
426470 print (f"Patch ID: { review_request .patch_id } " )
@@ -443,6 +487,9 @@ def main(args):
443487 except code_review .LargeDiffError :
444488 print ("Skipping the patch because it is too large." )
445489 continue
490+ except ModelResultError as e :
491+ print ("Error while running the tool:" , e )
492+ continue
446493
447494 print_prettified_comments (comments )
448495 comment_per_line_counter = defaultdict (int )
@@ -548,6 +595,14 @@ def main(args):
548595 action = "store" ,
549596 help = "the evaluation strategy to use" ,
550597 )
598+ parser .add_argument (
599+ "--variant" ,
600+ dest = "variants" ,
601+ action = "append" ,
602+ help = "the variants to use, use multiple times for multiple variants" ,
603+ choices = ["claude" , "gpt" ],
604+ required = True ,
605+ )
551606
552607 args = parser .parse_args ()
553608
0 commit comments