Skip to content

Commit d3788ec

Browse files
Merge pull request #874 from codeflash-ai/references-context
Python version and function references as additional context
2 parents befcc0b + 824c53f commit d3788ec

File tree

6 files changed

+62
-19
lines changed

6 files changed

+62
-19
lines changed

.github/workflows/unit-tests.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,8 +28,8 @@ jobs:
2828
- name: install dependencies
2929
run: uv sync
3030

31-
- name: Install test-only dependencies (Python 3.13)
32-
if: matrix.python-version == '3.13'
31+
- name: Install test-only dependencies (Python 3.9 and 3.13)
32+
if: matrix.python-version == '3.9' || matrix.python-version == '3.13'
3333
run: uv sync --group tests
3434

3535
- name: Unit tests

codeflash/api/aiservice.py

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -255,6 +255,8 @@ def optimize_python_code_refinement(self, request: list[AIServiceRefinerRequest]
255255
"optimized_code_runtime": opt.optimized_code_runtime,
256256
"speedup": opt.speedup,
257257
"trace_id": opt.trace_id,
258+
"function_references": opt.function_references,
259+
"python_version": platform.python_version(),
258260
}
259261
for opt in request
260262
]
@@ -308,6 +310,7 @@ def get_new_explanation( # noqa: D417
308310
original_throughput: str | None = None,
309311
optimized_throughput: str | None = None,
310312
throughput_improvement: str | None = None,
313+
function_references: str | None = None,
311314
) -> str:
312315
"""Optimize the given python code for performance by making a request to the Django endpoint.
313316
@@ -327,6 +330,7 @@ def get_new_explanation( # noqa: D417
327330
- original_throughput: str | None - throughput for the baseline code (operations per second)
328331
- optimized_throughput: str | None - throughput for the optimized code (operations per second)
329332
- throughput_improvement: str | None - throughput improvement percentage
333+
- function_references: str | None - where the function is called in the codebase
330334
331335
Returns
332336
-------
@@ -349,6 +353,7 @@ def get_new_explanation( # noqa: D417
349353
"original_throughput": original_throughput,
350354
"optimized_throughput": optimized_throughput,
351355
"throughput_improvement": throughput_improvement,
356+
"function_references": function_references,
352357
}
353358
logger.info("loading|Generating explanation")
354359
console.rule()
@@ -373,7 +378,12 @@ def get_new_explanation( # noqa: D417
373378
return ""
374379

375380
def generate_ranking( # noqa: D417
376-
self, trace_id: str, diffs: list[str], optimization_ids: list[str], speedups: list[float]
381+
self,
382+
trace_id: str,
383+
diffs: list[str],
384+
optimization_ids: list[str],
385+
speedups: list[float],
386+
function_references: str | None = None,
377387
) -> list[int] | None:
378388
"""Optimize the given python code for performance by making a request to the Django endpoint.
379389
@@ -382,6 +392,7 @@ def generate_ranking( # noqa: D417
382392
- trace_id : unique uuid of function
383393
- diffs : list of unified diff strings of opt candidates
384394
- speedups : list of speedups of opt candidates
395+
- function_references : where the function is called in the codebase
385396
386397
Returns
387398
-------
@@ -394,6 +405,7 @@ def generate_ranking( # noqa: D417
394405
"speedups": speedups,
395406
"optimization_ids": optimization_ids,
396407
"python_version": platform.python_version(),
408+
"function_references": function_references,
397409
}
398410
logger.info("loading|Generating ranking")
399411
console.rule()
@@ -594,6 +606,7 @@ def get_optimization_review(
594606
"optimized_runtime": humanize_runtime(explanation.best_runtime_ns),
595607
"original_runtime": humanize_runtime(explanation.original_runtime_ns),
596608
"calling_fn_details": calling_fn_details,
609+
"python_version": platform.python_version(),
597610
}
598611
console.rule()
599612
try:

codeflash/code_utils/code_extractor.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
from __future__ import annotations
22

33
import ast
4+
import time
45
from dataclasses import dataclass
56
from itertools import chain
67
from pathlib import Path
@@ -1138,6 +1139,7 @@ def find_specific_function_in_file(
11381139
def get_fn_references_jedi(
11391140
source_code: str, file_path: Path, project_root: Path, target_function: str, target_class: str | None
11401141
) -> list[Path]:
1142+
start_time = time.perf_counter()
11411143
function_position: CodePosition = find_specific_function_in_file(
11421144
source_code, file_path, target_function, target_class
11431145
)
@@ -1146,6 +1148,8 @@ def get_fn_references_jedi(
11461148
# Get references to the function
11471149
references = script.get_references(line=function_position.line_no, column=function_position.col_no)
11481150
# Collect unique file paths where references are found
1151+
end_time = time.perf_counter()
1152+
logger.debug(f"Jedi for function references ran in {end_time - start_time:.2f} seconds")
11491153
reference_files = set()
11501154
for ref in references:
11511155
if ref.module_path:
@@ -1163,6 +1167,7 @@ def get_fn_references_jedi(
11631167
def get_opt_review_metrics(
11641168
source_code: str, file_path: Path, qualified_name: str, project_root: Path, tests_root: Path
11651169
) -> str:
1170+
start_time = time.perf_counter()
11661171
try:
11671172
qualified_name_split = qualified_name.rsplit(".", maxsplit=1)
11681173
if len(qualified_name_split) == 1:
@@ -1176,4 +1181,6 @@ def get_opt_review_metrics(
11761181
except Exception as e:
11771182
calling_fns_details = ""
11781183
logger.debug(f"Investigate {e}")
1184+
end_time = time.perf_counter()
1185+
logger.debug(f"Got function references in {end_time - start_time:.2f} seconds")
11791186
return calling_fns_details

codeflash/models/models.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ class AIServiceRefinerRequest:
4444
trace_id: str
4545
original_line_profiler_results: str
4646
optimized_line_profiler_results: str
47+
function_references: str | None = None
4748

4849

4950
# If the method spam is in the class Ham, which is at the top level of the module eggs in the package foo, the fully

codeflash/optimization/function_optimizer.py

Lines changed: 37 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -244,7 +244,7 @@ def __init__(
244244
) = None
245245
n_tests = N_TESTS_TO_GENERATE_EFFECTIVE
246246
self.executor = concurrent.futures.ThreadPoolExecutor(
247-
max_workers=n_tests + 2 if self.experiment_id is None else n_tests + 3
247+
max_workers=n_tests + 3 if self.experiment_id is None else n_tests + 4
248248
)
249249

250250
def can_be_optimized(self) -> Result[tuple[bool, CodeOptimizationContext, dict[Path, str]], str]:
@@ -286,6 +286,7 @@ def generate_and_instrument_tests(
286286
list[Path],
287287
set[Path],
288288
dict | None,
289+
str,
289290
]
290291
]:
291292
"""Generate and instrument tests, returning all necessary data for optimization."""
@@ -323,9 +324,14 @@ def generate_and_instrument_tests(
323324

324325
generated_tests: GeneratedTestsList
325326
optimizations_set: OptimizationSet
326-
count_tests, generated_tests, function_to_concolic_tests, concolic_test_str, optimizations_set = (
327-
generated_results.unwrap()
328-
)
327+
(
328+
count_tests,
329+
generated_tests,
330+
function_to_concolic_tests,
331+
concolic_test_str,
332+
optimizations_set,
333+
function_references,
334+
) = generated_results.unwrap()
329335

330336
for i, generated_test in enumerate(generated_tests.generated_tests):
331337
with generated_test.behavior_file_path.open("w", encoding="utf8") as f:
@@ -371,6 +377,7 @@ def generate_and_instrument_tests(
371377
generated_perf_test_paths,
372378
instrumented_unittests_created_for_function,
373379
original_conftest_content,
380+
function_references,
374381
)
375382
)
376383

@@ -403,6 +410,7 @@ def optimize_function(self) -> Result[BestOptimization, str]:
403410
generated_perf_test_paths,
404411
instrumented_unittests_created_for_function,
405412
original_conftest_content,
413+
function_references,
406414
) = test_setup_result.unwrap()
407415

408416
baseline_setup_result = self.setup_and_establish_baseline(
@@ -437,6 +445,7 @@ def optimize_function(self) -> Result[BestOptimization, str]:
437445
generated_tests=generated_tests,
438446
test_functions_to_remove=test_functions_to_remove,
439447
concolic_test_str=concolic_test_str,
448+
function_references=function_references,
440449
)
441450

442451
# Add function to code context hash if in gh actions
@@ -458,6 +467,7 @@ def determine_best_candidate(
458467
original_helper_code: dict[Path, str],
459468
file_path_to_helper_classes: dict[Path, set[str]],
460469
exp_type: str,
470+
function_references: str,
461471
) -> BestOptimization | None:
462472
best_optimization: BestOptimization | None = None
463473
_best_runtime_until_now = original_code_baseline.runtime
@@ -667,6 +677,7 @@ def determine_best_candidate(
667677
else self.function_trace_id,
668678
ai_service_client=ai_service_client,
669679
executor=self.executor,
680+
function_references=function_references,
670681
)
671682
)
672683
else:
@@ -753,6 +764,7 @@ def determine_best_candidate(
753764
optimization_ids=optimization_ids,
754765
speedups=speedups_list,
755766
trace_id=self.function_trace_id[:-4] + exp_type if self.experiment_id else self.function_trace_id,
767+
function_references=function_references,
756768
)
757769
concurrent.futures.wait([future_ranking])
758770
ranking = future_ranking.result()
@@ -766,7 +778,7 @@ def determine_best_candidate(
766778
min_key = min(overall_ranking, key=overall_ranking.get)
767779
elif len(optimization_ids) == 1:
768780
min_key = 0 # only one candidate in valid _opts, already returns if there are no valid candidates
769-
else: # 0? shouldn't happen but it's there to escape potential bugs
781+
else: # 0? shouldn't happen, but it's there to escape potential bugs
770782
return None
771783
best_optimization = valid_candidates_with_shorter_code[min_key]
772784
# reassign code string which is the shortest
@@ -790,6 +802,7 @@ def refine_optimizations(
790802
trace_id: str,
791803
ai_service_client: AiServiceClient,
792804
executor: concurrent.futures.ThreadPoolExecutor,
805+
function_references: str | None = None,
793806
) -> concurrent.futures.Future:
794807
request = [
795808
AIServiceRefinerRequest(
@@ -804,6 +817,7 @@ def refine_optimizations(
804817
trace_id=trace_id,
805818
original_line_profiler_results=original_code_baseline.line_profile_results["str_out"],
806819
optimized_line_profiler_results=opt.line_profiler_test_results["str_out"],
820+
function_references=function_references,
807821
)
808822
for opt in valid_optimizations
809823
]
@@ -1089,7 +1103,7 @@ def generate_tests_and_optimizations(
10891103
generated_test_paths: list[Path],
10901104
generated_perf_test_paths: list[Path],
10911105
run_experiment: bool = False, # noqa: FBT001, FBT002
1092-
) -> Result[tuple[GeneratedTestsList, dict[str, set[FunctionCalledInTest]], OptimizationSet], str]:
1106+
) -> Result[tuple[GeneratedTestsList, dict[str, set[FunctionCalledInTest]], OptimizationSet], str, str]:
10931107
n_tests = N_TESTS_TO_GENERATE_EFFECTIVE
10941108
assert len(generated_test_paths) == n_tests
10951109
console.rule()
@@ -1116,7 +1130,15 @@ def generate_tests_and_optimizations(
11161130
future_concolic_tests = self.executor.submit(
11171131
generate_concolic_tests, self.test_cfg, self.args, self.function_to_optimize, self.function_to_optimize_ast
11181132
)
1119-
futures = [*future_tests, future_optimization_candidates, future_concolic_tests]
1133+
future_references = self.executor.submit(
1134+
get_opt_review_metrics,
1135+
self.function_to_optimize_source_code,
1136+
self.function_to_optimize.file_path,
1137+
self.function_to_optimize.qualified_name,
1138+
self.project_root,
1139+
self.test_cfg.tests_root,
1140+
)
1141+
futures = [*future_tests, future_optimization_candidates, future_concolic_tests, future_references]
11201142
if run_experiment:
11211143
future_candidates_exp = self.executor.submit(
11221144
self.local_aiservice_client.optimize_python_code,
@@ -1168,7 +1190,7 @@ def generate_tests_and_optimizations(
11681190
logger.warning(f"Failed to generate and instrument tests for {self.function_to_optimize.function_name}")
11691191
return Failure(f"/!\\ NO TESTS GENERATED for {self.function_to_optimize.function_name}")
11701192
function_to_concolic_tests, concolic_test_str = future_concolic_tests.result()
1171-
1193+
function_references = future_references.result()
11721194
count_tests = len(tests)
11731195
if concolic_test_str:
11741196
count_tests += 1
@@ -1182,6 +1204,7 @@ def generate_tests_and_optimizations(
11821204
function_to_concolic_tests,
11831205
concolic_test_str,
11841206
OptimizationSet(control=candidates, experiment=candidates_experiment),
1207+
function_references,
11851208
)
11861209
self.generate_and_instrument_tests_results = result
11871210
return Success(result)
@@ -1263,6 +1286,7 @@ def find_and_process_best_optimization(
12631286
generated_tests: GeneratedTestsList,
12641287
test_functions_to_remove: list[str],
12651288
concolic_test_str: str | None,
1289+
function_references: str,
12661290
) -> BestOptimization | None:
12671291
"""Find the best optimization candidate and process it with all required steps."""
12681292
best_optimization = None
@@ -1279,6 +1303,7 @@ def find_and_process_best_optimization(
12791303
original_helper_code=original_helper_code,
12801304
file_path_to_helper_classes=file_path_to_helper_classes,
12811305
exp_type=exp_type,
1306+
function_references=function_references,
12821307
)
12831308
ph(
12841309
"cli-optimize-function-finished",
@@ -1347,6 +1372,7 @@ def find_and_process_best_optimization(
13471372
exp_type,
13481373
original_helper_code,
13491374
code_context,
1375+
function_references,
13501376
)
13511377
return best_optimization
13521378

@@ -1364,6 +1390,7 @@ def process_review(
13641390
exp_type: str,
13651391
original_helper_code: dict[Path, str],
13661392
code_context: CodeOptimizationContext,
1393+
function_references: str,
13671394
) -> None:
13681395
coverage_message = (
13691396
original_code_baseline.coverage_results.build_message()
@@ -1430,6 +1457,7 @@ def process_review(
14301457
original_throughput=original_throughput_str,
14311458
optimized_throughput=optimized_throughput_str,
14321459
throughput_improvement=throughput_improvement_str,
1460+
function_references=function_references,
14331461
)
14341462
new_explanation = Explanation(
14351463
raw_explanation_message=new_explanation_raw_str or explanation.raw_explanation_message,
@@ -1466,16 +1494,9 @@ def process_review(
14661494
opt_review_response = ""
14671495
if raise_pr or staging_review:
14681496
data["root_dir"] = git_root_dir()
1469-
calling_fn_details = get_opt_review_metrics(
1470-
self.function_to_optimize_source_code,
1471-
self.function_to_optimize.file_path,
1472-
self.function_to_optimize.qualified_name,
1473-
self.project_root,
1474-
self.test_cfg.tests_root,
1475-
)
14761497
try:
14771498
opt_review_response = self.aiservice_client.get_optimization_review(
1478-
**data, calling_fn_details=calling_fn_details
1499+
**data, calling_fn_details=function_references
14791500
)
14801501
except Exception as e:
14811502
logger.debug(f"optimization review response failed, investigate {e}")

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,7 @@ tests = [
9494
"scipy>=1.13.1",
9595
"torch>=2.8.0",
9696
"xarray>=2024.7.0",
97+
"eval_type_backport"
9798
]
9899

99100
[tool.hatch.build.targets.sdist]

0 commit comments

Comments (0)