Commit 8a28d0d

Merge branch 'feat/feedback-loop-for-unmatched-test-results' of github.com:codeflash-ai/codeflash into feat/feedback-loop-for-unmatched-test-results
2 parents 6ea2545 + 5a7c356 commit 8a28d0d

4 files changed: 147 additions, 27 deletions


codeflash/api/aiservice.py

Lines changed: 54 additions & 1 deletion

@@ -27,7 +27,7 @@
 
 from codeflash.discovery.functions_to_optimize import FunctionToOptimize
 from codeflash.models.ExperimentMetadata import ExperimentMetadata
-from codeflash.models.models import AIServiceRefinerRequest
+from codeflash.models.models import AIServiceCodeRepairRequest, AIServiceRefinerRequest
 from codeflash.result.explanation import Explanation
 
 
@@ -294,6 +294,59 @@ def optimize_python_code_refinement(self, request: list[AIServiceRefinerRequest]
         console.rule()
         return []
 
+    def optimize_python_code_repair(self, request: list[AIServiceCodeRepairRequest]) -> list[OptimizedCandidate]:
+        """Optimize the given python code for performance by making a request to the Django endpoint.
+
+        Args:
+            request: A list of optimization candidate details for refinement
+
+        Returns:
+        -------
+            - List[OptimizationCandidate]: A list of Optimization Candidates.
+
+        """
+        payload = [
+            {
+                "optimization_id": opt.optimization_id,
+                "original_source_code": opt.original_source_code,
+                "modified_source_code": opt.modified_source_code,
+                "trace_id": opt.trace_id,
+            }
+            for opt in request
+        ]
+        # logger.debug(f"Repair {len(request)} optimizations…")
+        console.rule()
+        try:
+            response = self.make_ai_service_request("/code_repair", payload=payload, timeout=120)
+        except requests.exceptions.RequestException as e:
+            logger.exception(f"Error generating optimization repair: {e}")
+            ph("cli-optimize-error-caught", {"error": str(e)})
+            return []
+
+        if response.status_code == 200:
+            refined_optimizations = response.json()["code_repairs"]
+            logger.debug(f"Generated {len(refined_optimizations)} candidate refinements.")
+            console.rule()
+
+            refinements = self._get_valid_candidates(refined_optimizations)
+            return [
+                OptimizedCandidate(
+                    source_code=c.source_code,
+                    explanation=c.explanation,
+                    optimization_id=c.optimization_id[:-4] + "cdrp",
+                )
+                for c in refinements
+            ]
+
+        try:
+            error = response.json()["error"]
+        except Exception:
+            error = response.text
+        logger.error(f"Error generating optimized candidates: {response.status_code} - {error}")
+        ph("cli-optimize-error-response", {"response_status_code": response.status_code, "error": error})
+        console.rule()
+        return []
+
     def get_new_explanation(  # noqa: D417
         self,
         source_code: str,
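
Repaired candidates are re-tagged by replacing the last four characters of the source candidate's optimization id with "cdrp", mirroring the "refi" suffix used for refinement candidates elsewhere in this change. A tiny illustration with a made-up id:

    # Illustration of the id re-tagging above; the id value is made up.
    candidate_id = "9f2c51b0refi"             # e.g. an id from an earlier refinement pass
    repaired_id = candidate_id[:-4] + "cdrp"
    assert repaired_id == "9f2c51b0cdrp"      # repaired candidates carry the "cdrp" suffix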

codeflash/models/models.py

Lines changed: 9 additions & 0 deletions

@@ -48,6 +48,15 @@ class AIServiceRefinerRequest:
     function_references: str | None = None
 
 
+@dataclass(frozen=True)
+class AIServiceCodeRepairRequest:
+    optimization_id: str
+    original_source_code: str
+    modified_source_code: str
+    test_details: str
+    trace_id: str
+
+
 # If the method spam is in the class Ham, which is at the top level of the module eggs in the package foo, the fully
 # qualified name of the method is foo.eggs.Ham.spam, its qualified name is Ham.spam, and its name is spam. The full name
 # of the module is foo.eggs.
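
A minimal construction sketch for the new request model; the field values below are hypothetical, and the import path follows the aiservice.py import above. frozen=True makes instances immutable.

    from codeflash.models.models import AIServiceCodeRepairRequest

    # All values below are hypothetical placeholders.
    request = AIServiceCodeRepairRequest(
        optimization_id="9f2c51b0refi",
        original_source_code="def add(a, b):\n    return a + b\n",
        modified_source_code="def add(a, b):\n    return a - b\n",  # the broken optimization to repair
        test_details="Expected Value: 4\nActual Value: 0\nError String:None\n",
        trace_id="trace-1234",
    )
    # request.trace_id = "other"  # would raise dataclasses.FrozenInstanceError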

codeflash/optimization/function_optimizer.py

Lines changed: 83 additions & 24 deletions

@@ -13,6 +13,7 @@
 from typing import TYPE_CHECKING
 
 import libcst as cst
+import sentry_sdk
 from rich.console import Group
 from rich.panel import Panel
 from rich.syntax import Syntax
@@ -69,6 +70,7 @@
 from codeflash.lsp.lsp_message import LspCodeMessage, LspMarkdownMessage, LSPMessageId
 from codeflash.models.ExperimentMetadata import ExperimentMetadata
 from codeflash.models.models import (
+    AIServiceCodeRepairRequest,
     BestOptimization,
     CodeOptimizationContext,
     GeneratedTests,
@@ -589,6 +591,28 @@ def determine_best_candidate(
                     optimized_runtimes[candidate.optimization_id] = None
                     is_correct[candidate.optimization_id] = False
                     speedup_ratios[candidate.optimization_id] = None
+                    fail_value = run_results.value
+                    if (
+                        fail_value != "Test results did not match the test results of the original code."
+                        and len(future_all_refinements) <= 3
+                        and not candidate.optimization_id.endswith("cdrp")
+                    ):
+                        # # queue corresponding code repair optimization for best optimization
+                        future_all_refinements.append(
+                            self.code_repair_optimizations(
+                                original_source_code=candidate,
+                                modified_source_code=code_context,
+                                original_code_baseline=original_code_baseline,
+                                test_details="test_details",
+                                code_context=code_context,
+                                trace_id=self.function_trace_id[:-4] + exp_type
+                                if self.experiment_id
+                                else self.function_trace_id,
+                                ai_service_client=ai_service_client,
+                                executor=self.executor,
+                                function_references=function_references,
+                            )
+                        )
                 else:
                     candidate_result: OptimizedCandidateResult = run_results.unwrap()
                     best_test_runtime = candidate_result.best_test_runtime
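
The queueing condition added above can be read as a standalone predicate. The restatement below is for illustration only (it is not codeflash code); the string literal and the "cdrp" suffix come from the diff, and the failure payload is the aggregated per-test diff text that run_optimized_candidate now returns for repairable mismatches.

    # Illustration only: restates the gate from the hunk above.
    def should_queue_code_repair(fail_value: str, queued_futures: int, optimization_id: str) -> bool:
        return (
            # the Failure carries detailed diff text, not the generic "can't fix it" message
            fail_value != "Test results did not match the test results of the original code."
            # keep the number of queued repair/refinement futures bounded
            and queued_futures <= 3
            # never re-repair a candidate that already came out of code repair
            and not optimization_id.endswith("cdrp")
        )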
@@ -672,21 +696,21 @@
                         async_throughput=candidate_result.async_throughput,
                     )
                     valid_optimizations.append(best_optimization)
-                    # queue corresponding refined optimization for best optimization
-                    if not candidate.optimization_id.endswith("refi"):
-                        future_all_refinements.append(
-                            self.refine_optimizations(
-                                valid_optimizations=[best_optimization],
-                                original_code_baseline=original_code_baseline,
-                                code_context=code_context,
-                                trace_id=self.function_trace_id[:-4] + exp_type
-                                if self.experiment_id
-                                else self.function_trace_id,
-                                ai_service_client=ai_service_client,
-                                executor=self.executor,
-                                function_references=function_references,
-                            )
-                        )
+                    # # queue corresponding refined optimization for best optimization
+                    # if not candidate.optimization_id.endswith("refi"):
+                    #     future_all_refinements.append(
+                    #         self.refine_optimizations(
+                    #             valid_optimizations=[best_optimization],
+                    #             original_code_baseline=original_code_baseline,
+                    #             code_context=code_context,
+                    #             trace_id=self.function_trace_id[:-4] + exp_type
+                    #             if self.experiment_id
+                    #             else self.function_trace_id,
+                    #             ai_service_client=ai_service_client,
+                    #             executor=self.executor,
+                    #             function_references=function_references,
+                    #         )
+                    #     )
                 else:
                     # For async functions, prioritize throughput metrics over runtime even for slow candidates
                     is_async = (
@@ -839,6 +863,26 @@ def refine_optimizations(
         ]
         return executor.submit(ai_service_client.optimize_python_code_refinement, request=request)
 
+    def code_repair_optimizations(
+        self,
+        original_source_code: str,
+        modified_source_code: str,
+        test_details: str,
+        trace_id: str,
+        ai_service_client: AiServiceClient,
+        executor: concurrent.futures.ThreadPoolExecutor,
+    ) -> concurrent.futures.Future:
+        request = [
+            AIServiceCodeRepairRequest(
+                optimization_id="",
+                original_source_code=original_source_code,
+                modified_source_code=modified_source_code,
+                test_details=test_details,
+                trace_id=trace_id,
+            )
+        ]
+        return executor.submit(ai_service_client.optimize_python_code_repair, request=request)
+
     def log_successful_optimization(
         self, explanation: Explanation, generated_tests: GeneratedTestsList, exp_type: str
     ) -> None:
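
Like refine_optimizations above, the new helper submits the request to a shared ThreadPoolExecutor and returns the Future; the caller (determine_best_candidate) appends it to future_all_refinements and drains it later. Below is a generic, self-contained sketch of that submit-and-collect pattern, with a stand-in for the AI-service call (it is not codeflash code):

    import concurrent.futures

    # fake_repair stands in for ai_service_client.optimize_python_code_repair, which
    # returns a list of candidates (or [] when the request fails).
    def queue_repair(executor: concurrent.futures.ThreadPoolExecutor, payload: str) -> concurrent.futures.Future:
        def fake_repair(p: str) -> list:
            return [f"repaired::{p}"]
        return executor.submit(fake_repair, payload)

    with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
        pending = [queue_repair(executor, name) for name in ("cand_a", "cand_b")]
        for done in concurrent.futures.as_completed(pending):
            for candidate in done.result():
                print(candidate)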
@@ -1813,6 +1857,7 @@ def run_optimized_candidate(
                 )
             )
             console.rule()
+            # print(type(code_context), type(candidate))
             match, diffs = compare_test_results(baseline_results.behavior_test_results, candidate_behavior_results)
             if match:
                 logger.info("h3|Test results matched ✅")
@@ -1823,15 +1868,29 @@
                     # if the test unmatched percentage is greater than 50%, we can't fix it
                     return self.get_results_not_matched_error()
 
-                print(f"should try to fix it, diffs: {diffs}")
-                # with the parsed test results diff ask the llm to fix the candidate to match the test results of the original code, and run again
-                # self.run_optimized_candidate(
-                #     optimization_candidate_index=optimization_candidate_index,
-                #     baseline_results=baseline_results,
-                #     original_helper_code=original_helper_code,
-                #     file_path_to_helper_classes=file_path_to_helper_classes,
-                # )
-                return self.get_results_not_matched_error()
+                logger.info("running code repair...")
+                # not sure if all return types will be convertible to string
+                diff_per_test_fn = {}
+                for diff in diffs:
+                    try:
+                        diff_per_test_fn[diff.test_src_code] = (
+                            diff_per_test_fn.setdefault(diff.test_src_code, "")
+                            + f"Expected Value: {diff.original_value!s}\nActual Value: {diff.candidate_value!s}\nError String:{diff.pytest_error}\n"
+                        )
+
+                    except Exception as e:
+                        sentry_sdk.capture_exception(e)
+                        logger.exception(e)
+                        return self.get_results_not_matched_error()
+                try:
+                    test_issues = "\n".join(
+                        f"{test_fn_def}\n{value}" for test_fn_def, value in diff_per_test_fn.items()
+                    )
+                except Exception as e:
+                    sentry_sdk.capture_exception(e)
+                    logger.exception(e)
+                    return self.get_results_not_matched_error()
+                return Failure(test_issues)
 
             logger.info(f"loading|Running performance tests for candidate {optimization_candidate_index}...")
 
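
When behavior tests mismatch but the unmatched percentage is not above the 50% cutoff, the candidate is no longer dropped immediately: the per-test differences are folded into a single string and returned as Failure(test_issues), which determine_best_candidate then feeds to code repair. Below is a standalone sketch of that aggregation, using a hypothetical stand-in for the diff objects (only the attributes the loop reads are mirrored):

    from __future__ import annotations

    from dataclasses import dataclass

    @dataclass
    class TestDiff:  # hypothetical stand-in; real diffs come from compare_test_results
        test_src_code: str
        original_value: object
        candidate_value: object
        pytest_error: str | None

    diffs = [
        TestDiff("def test_add():\n    assert add(2, 2) == 4", 4, 5, None),
        TestDiff("def test_add():\n    assert add(2, 2) == 4", "4\n", "5\n", "AssertionError"),
    ]

    # Same aggregation as the hunk above: mismatches are grouped per test function source.
    diff_per_test_fn: dict[str, str] = {}
    for diff in diffs:
        diff_per_test_fn[diff.test_src_code] = (
            diff_per_test_fn.setdefault(diff.test_src_code, "")
            + f"Expected Value: {diff.original_value!s}\nActual Value: {diff.candidate_value!s}\nError String:{diff.pytest_error}\n"
        )

    test_issues = "\n".join(f"{test_fn_def}\n{value}" for test_fn_def, value in diff_per_test_fn.items())
    print(test_issues)  # both mismatches appear under the same test function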

codeflash/verification/equivalence.py

Lines changed: 1 addition & 2 deletions

@@ -118,7 +118,6 @@ def compare_test_results(original_results: TestResults, candidate_results: TestR
                 test_diff.original_value = original_test_result.stdout
                 test_diff.candidate_value = cdd_test_result.stdout
                 test_diffs.append(test_diff)
-                break
 
             if original_test_result.test_type in {
                 TestType.EXISTING_UNIT_TEST,
@@ -130,7 +129,7 @@ def compare_test_results(original_results: TestResults, candidate_results: TestR
                 test_diff.original_value = original_test_result.did_pass
                 test_diff.candidate_value = cdd_test_result.did_pass
                 test_diffs.append(test_diff)
-                break
+
     sys.setrecursionlimit(original_recursion_limit)
    if did_all_timeout:
        return False, test_diffs
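
With the two break statements removed, compare_test_results no longer stops the comparison loop at the first recorded difference; every mismatching result is appended to test_diffs, so the code-repair request sees the full set of differences. A minimal, generic illustration of the behavioral change (not codeflash code):

    # Minimal illustration: without the break, all mismatches are collected.
    mismatches = ["stdout differs", "did_pass differs", "return value differs"]
    test_diffs = []
    for m in mismatches:
        test_diffs.append(m)
        # previously: break  # would have stopped after the first mismatch
    assert test_diffs == mismatches  # all three reach the code-repair prompt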
