From 3ee6e90bdd9f6ab210d3ae34b399bd8d25acbd1f Mon Sep 17 00:00:00 2001 From: Kevin Turcios Date: Tue, 23 Dec 2025 03:31:07 -0500 Subject: [PATCH 1/6] divertsity --- codeflash/code_utils/config_consts.py | 28 ++++++++++++++++++++ codeflash/models/models.py | 1 + codeflash/optimization/function_optimizer.py | 27 +++++++++++-------- 3 files changed, 45 insertions(+), 11 deletions(-) diff --git a/codeflash/code_utils/config_consts.py b/codeflash/code_utils/config_consts.py index 88758455e..aa31d8063 100644 --- a/codeflash/code_utils/config_consts.py +++ b/codeflash/code_utils/config_consts.py @@ -32,6 +32,32 @@ MAX_N_CANDIDATES = 5 MAX_N_CANDIDATES_LP = 6 +# Multi-model diversity configuration +# Each tuple is (model_name, num_calls) where each call returns 1 candidate +# Standard mode: 3 GPT-4.1 + 2 Claude Sonnet = 5 candidates +MODEL_DISTRIBUTION: list[tuple[str, int]] = [ + ("gpt-4.1", 3), + ("claude-sonnet-4-5", 2), +] + +# LSP mode: fewer candidates for faster response +MODEL_DISTRIBUTION_LSP: list[tuple[str, int]] = [ + ("gpt-4.1", 2), + ("claude-sonnet-4-5", 1), +] + +# Line profiler mode: 6 candidates total +MODEL_DISTRIBUTION_LP: list[tuple[str, int]] = [ + ("gpt-4.1", 4), + ("claude-sonnet-4-5", 2), +] + +# Line profiler LSP mode +MODEL_DISTRIBUTION_LP_LSP: list[tuple[str, int]] = [ + ("gpt-4.1", 2), + ("claude-sonnet-4-5", 1), +] + try: from codeflash.lsp.helpers import is_LSP_enabled @@ -43,5 +69,7 @@ N_CANDIDATES_LP_EFFECTIVE = min(N_CANDIDATES_LP_LSP if _IS_LSP_ENABLED else N_CANDIDATES_LP, MAX_N_CANDIDATES_LP) N_TESTS_TO_GENERATE_EFFECTIVE = N_TESTS_TO_GENERATE_LSP if _IS_LSP_ENABLED else N_TESTS_TO_GENERATE TOTAL_LOOPING_TIME_EFFECTIVE = TOTAL_LOOPING_TIME_LSP if _IS_LSP_ENABLED else TOTAL_LOOPING_TIME +MODEL_DISTRIBUTION_EFFECTIVE = MODEL_DISTRIBUTION_LSP if _IS_LSP_ENABLED else MODEL_DISTRIBUTION +MODEL_DISTRIBUTION_LP_EFFECTIVE = MODEL_DISTRIBUTION_LP_LSP if _IS_LSP_ENABLED else MODEL_DISTRIBUTION_LP MAX_CONTEXT_LEN_REVIEW = 1000 diff --git a/codeflash/models/models.py b/codeflash/models/models.py index 1db09bc12..4f7553818 100644 --- a/codeflash/models/models.py +++ b/codeflash/models/models.py @@ -464,6 +464,7 @@ class OptimizedCandidate: optimization_id: str source: OptimizedCandidateSource parent_id: str | None = None + model: str | None = None # Which LLM model generated this candidate @dataclass(frozen=True) diff --git a/codeflash/optimization/function_optimizer.py b/codeflash/optimization/function_optimizer.py index 416bdc8df..8776d9c58 100644 --- a/codeflash/optimization/function_optimizer.py +++ b/codeflash/optimization/function_optimizer.py @@ -46,6 +46,8 @@ COVERAGE_THRESHOLD, INDIVIDUAL_TESTCASE_TIMEOUT, MAX_REPAIRS_PER_TRACE, + MODEL_DISTRIBUTION_EFFECTIVE, + MODEL_DISTRIBUTION_LP_EFFECTIVE, N_CANDIDATES_EFFECTIVE, N_CANDIDATES_LP_EFFECTIVE, N_TESTS_TO_GENERATE_EFFECTIVE, @@ -921,18 +923,20 @@ def determine_best_candidate( ai_service_client = self.aiservice_client if exp_type == "EXP0" else self.local_aiservice_client assert ai_service_client is not None, "AI service client must be set for optimization" + # Use multi-model approach for line profiler optimization future_line_profile_results = self.executor.submit( - ai_service_client.optimize_python_code_line_profiler, + ai_service_client.optimize_python_code_line_profiler_multi_model, source_code=code_context.read_writable_code.markdown, dependency_code=code_context.read_only_context_code, - trace_id=self.get_trace_id(exp_type), + base_trace_id=self.get_trace_id(exp_type), 
line_profiler_results=original_code_baseline.line_profile_results["str_out"], - num_candidates=N_CANDIDATES_LP_EFFECTIVE, + model_distribution=MODEL_DISTRIBUTION_LP_EFFECTIVE, experiment_metadata=ExperimentMetadata( id=self.experiment_id, group="control" if exp_type == "EXP0" else "experiment" ) if self.experiment_id else None, + executor=self.executor, ) processor = CandidateProcessor( @@ -1353,17 +1357,17 @@ def generate_optimizations( read_only_context_code: str, run_experiment: bool = False, # noqa: FBT001, FBT002 ) -> Result[tuple[OptimizationSet, str], str]: - """Generate optimization candidates for the function.""" - n_candidates = N_CANDIDATES_EFFECTIVE - + """Generate optimization candidates for the function using multiple models in parallel.""" + # Use multi-model approach for diversity future_optimization_candidates = self.executor.submit( - self.aiservice_client.optimize_python_code, + self.aiservice_client.optimize_python_code_multi_model, read_writable_code.markdown, read_only_context_code, self.function_trace_id[:-4] + "EXP0" if run_experiment else self.function_trace_id, - n_candidates, + MODEL_DISTRIBUTION_EFFECTIVE, ExperimentMetadata(id=self.experiment_id, group="control") if run_experiment else None, is_async=self.function_to_optimize.is_async, + executor=self.executor, ) future_references = self.executor.submit( @@ -1380,13 +1384,14 @@ def generate_optimizations( if run_experiment: future_candidates_exp = self.executor.submit( - self.local_aiservice_client.optimize_python_code, + self.local_aiservice_client.optimize_python_code_multi_model, read_writable_code.markdown, read_only_context_code, self.function_trace_id[:-4] + "EXP1", - n_candidates, + MODEL_DISTRIBUTION_EFFECTIVE, ExperimentMetadata(id=self.experiment_id, group="experiment"), is_async=self.function_to_optimize.is_async, + executor=self.executor, ) futures.append(future_candidates_exp) @@ -1395,7 +1400,7 @@ def generate_optimizations( # Retrieve results candidates: list[OptimizedCandidate] = future_optimization_candidates.result() - logger.info(f"!lsp|Generated '{len(candidates)}' candidate optimizations.") + logger.info(f"!lsp|Generated '{len(candidates)}' candidate optimizations from multiple models.") if not candidates: return Failure(f"/!\\ NO OPTIMIZATIONS GENERATED for {self.function_to_optimize.function_name}") From 35ae79e8e6f778440bae4bdae5f87c863cdf6acd Mon Sep 17 00:00:00 2001 From: Kevin Turcios Date: Tue, 23 Dec 2025 03:42:32 -0500 Subject: [PATCH 2/6] add diversity --- codeflash/api/aiservice.py | 110 ++++++++++++++++++- codeflash/code_utils/config_consts.py | 20 +--- codeflash/optimization/function_optimizer.py | 5 - 3 files changed, 108 insertions(+), 27 deletions(-) diff --git a/codeflash/api/aiservice.py b/codeflash/api/aiservice.py index 86fb125b7..78d042791 100644 --- a/codeflash/api/aiservice.py +++ b/codeflash/api/aiservice.py @@ -1,5 +1,6 @@ from __future__ import annotations +import concurrent.futures import json import os import platform @@ -12,7 +13,6 @@ from codeflash.cli_cmds.console import console, logger from codeflash.code_utils.code_replacer import is_zero_diff from codeflash.code_utils.code_utils import unified_diff_strings -from codeflash.code_utils.config_consts import N_CANDIDATES_EFFECTIVE, N_CANDIDATES_LP_EFFECTIVE from codeflash.code_utils.env_utils import get_codeflash_api_key from codeflash.code_utils.git_utils import get_last_commit_author_if_pr_exists, get_repo_owner_and_name from codeflash.code_utils.time_utils import humanize_runtime @@ -35,6 +35,8 @@ from 
codeflash.models.models import AIServiceCodeRepairRequest, AIServiceRefinerRequest from codeflash.result.explanation import Explanation +multi_model_executor = concurrent.futures.ThreadPoolExecutor(max_workers=10, thread_name_prefix="multi_model") + class AiServiceClient: def __init__(self) -> None: @@ -92,7 +94,7 @@ def make_ai_service_request( return response def _get_valid_candidates( - self, optimizations_json: list[dict[str, Any]], source: OptimizedCandidateSource + self, optimizations_json: list[dict[str, Any]], source: OptimizedCandidateSource, model: str | None = None ) -> list[OptimizedCandidate]: candidates: list[OptimizedCandidate] = [] for opt in optimizations_json: @@ -106,6 +108,7 @@ def _get_valid_candidates( optimization_id=opt["optimization_id"], source=source, parent_id=opt.get("parent_id", None), + model=model, ) ) return candidates @@ -119,6 +122,7 @@ def optimize_python_code( # noqa: D417 experiment_metadata: ExperimentMetadata | None = None, *, is_async: bool = False, + model: str | None = None, ) -> list[OptimizedCandidate]: """Optimize the given python code for performance by making a request to the Django endpoint. @@ -129,6 +133,7 @@ def optimize_python_code( # noqa: D417 - trace_id (str): Trace id of optimization run - num_candidates (int): Number of optimization variants to generate. Default is 10. - experiment_metadata (Optional[ExperimentalMetadata, None]): Any available experiment metadata for this optimization + - model (str | None): Model name to use ("gpt-4.1" or "claude-sonnet-4-5"). Default is None (server default). Returns ------- @@ -149,8 +154,9 @@ def optimize_python_code( # noqa: D417 "current_username": get_last_commit_author_if_pr_exists(None), "repo_owner": git_repo_owner, "repo_name": git_repo_name, - "n_candidates": N_CANDIDATES_EFFECTIVE, + "n_candidates": num_candidates, "is_async": is_async, + "model": model, } logger.info("!lsp|Generating optimized candidates…") @@ -167,7 +173,7 @@ def optimize_python_code( # noqa: D417 console.rule() end_time = time.perf_counter() logger.debug(f"!lsp|Generating possible optimizations took {end_time - start_time:.2f} seconds.") - return self._get_valid_candidates(optimizations_json, OptimizedCandidateSource.OPTIMIZE) + return self._get_valid_candidates(optimizations_json, OptimizedCandidateSource.OPTIMIZE, model=model) try: error = response.json()["error"] except Exception: @@ -185,6 +191,7 @@ def optimize_python_code_line_profiler( # noqa: D417 line_profiler_results: str, num_candidates: int = 10, experiment_metadata: ExperimentMetadata | None = None, + model: str | None = None, ) -> list[OptimizedCandidate]: """Optimize the given python code for performance by making a request to the Django endpoint. @@ -195,6 +202,7 @@ def optimize_python_code_line_profiler( # noqa: D417 - trace_id (str): Trace id of optimization run - num_candidates (int): Number of optimization variants to generate. Default is 10. - experiment_metadata (Optional[ExperimentalMetadata, None]): Any available experiment metadata for this optimization + - model (str | None): Model name to use ("gpt-4.1" or "claude-sonnet-4-5"). Default is None (server default). 
Returns ------- @@ -211,7 +219,8 @@ def optimize_python_code_line_profiler( # noqa: D417 "experiment_metadata": experiment_metadata, "codeflash_version": codeflash_version, "lsp_mode": is_LSP_enabled(), - "n_candidates_lp": N_CANDIDATES_LP_EFFECTIVE, + "n_candidates_lp": num_candidates, + "model": model, } console.rule() @@ -232,7 +241,7 @@ def optimize_python_code_line_profiler( # noqa: D417 f"!lsp|Generated {len(optimizations_json)} candidate optimizations using line profiler information." ) console.rule() - return self._get_valid_candidates(optimizations_json, OptimizedCandidateSource.OPTIMIZE_LP) + return self._get_valid_candidates(optimizations_json, OptimizedCandidateSource.OPTIMIZE_LP, model=model) try: error = response.json()["error"] except Exception: @@ -242,6 +251,95 @@ def optimize_python_code_line_profiler( # noqa: D417 console.rule() return [] + def optimize_python_code_multi_model( + self, + source_code: str, + dependency_code: str, + base_trace_id: str, + model_distribution: list[tuple[str, int]], + experiment_metadata: ExperimentMetadata | None = None, + *, + is_async: bool = False, + ) -> list[OptimizedCandidate]: + """Generate optimizations using multiple models in parallel.""" + futures: list[tuple[concurrent.futures.Future[list[OptimizedCandidate]], str]] = [] + call_index = 0 + + for model_name, num_calls in model_distribution: + for _ in range(num_calls): + call_trace_id = f"{base_trace_id[:-4]}M{call_index:02d}" + call_index += 1 + + future = multi_model_executor.submit( + self.optimize_python_code, + source_code, + dependency_code, + call_trace_id, + num_candidates=1, # Each call returns 1 candidate + experiment_metadata=experiment_metadata, + is_async=is_async, + model=model_name, + ) + futures.append((future, model_name)) + + # Wait for all calls to complete + concurrent.futures.wait([f for f, _ in futures]) + + # Collect results + all_candidates: list[OptimizedCandidate] = [] + for future, model_name in futures: + try: + candidates = future.result() + all_candidates.extend(candidates) + except Exception as e: + logger.warning(f"Model {model_name} call failed: {e}") + continue + + return all_candidates + + def optimize_python_code_line_profiler_multi_model( + self, + source_code: str, + dependency_code: str, + base_trace_id: str, + line_profiler_results: str, + model_distribution: list[tuple[str, int]], + experiment_metadata: ExperimentMetadata | None = None, + ) -> list[OptimizedCandidate]: + """Generate line profiler optimizations using multiple models in parallel.""" + futures: list[tuple[concurrent.futures.Future[list[OptimizedCandidate]], str]] = [] + call_index = 0 + + for model_name, num_calls in model_distribution: + for _ in range(num_calls): + call_trace_id = f"{base_trace_id[:-4]}L{call_index:02d}" + call_index += 1 + + future = multi_model_executor.submit( + self.optimize_python_code_line_profiler, + source_code, + dependency_code, + call_trace_id, + line_profiler_results, + num_candidates=1, + experiment_metadata=experiment_metadata, + model=model_name, + ) + futures.append((future, model_name)) + + concurrent.futures.wait([f for f, _ in futures]) + + all_candidates: list[OptimizedCandidate] = [] + for future, model_name in futures: + try: + candidates = future.result() + all_candidates.extend(candidates) + except Exception as e: + logger.warning(f"Line profiler model {model_name} call failed: {e}") + continue + + return all_candidates + def optimize_python_code_refinement(self, request: list[AIServiceRefinerRequest]) -> list[OptimizedCandidate]: 
"""Optimize the given python code for performance by making a request to the Django endpoint. diff --git a/codeflash/code_utils/config_consts.py b/codeflash/code_utils/config_consts.py index aa31d8063..ba09989f8 100644 --- a/codeflash/code_utils/config_consts.py +++ b/codeflash/code_utils/config_consts.py @@ -35,28 +35,16 @@ # Multi-model diversity configuration # Each tuple is (model_name, num_calls) where each call returns 1 candidate # Standard mode: 3 GPT-4.1 + 2 Claude Sonnet = 5 candidates -MODEL_DISTRIBUTION: list[tuple[str, int]] = [ - ("gpt-4.1", 3), - ("claude-sonnet-4-5", 2), -] +MODEL_DISTRIBUTION: list[tuple[str, int]] = [("gpt-4.1", 3), ("claude-sonnet-4-5", 2)] # LSP mode: fewer candidates for faster response -MODEL_DISTRIBUTION_LSP: list[tuple[str, int]] = [ - ("gpt-4.1", 2), - ("claude-sonnet-4-5", 1), -] +MODEL_DISTRIBUTION_LSP: list[tuple[str, int]] = [("gpt-4.1", 2), ("claude-sonnet-4-5", 1)] # Line profiler mode: 6 candidates total -MODEL_DISTRIBUTION_LP: list[tuple[str, int]] = [ - ("gpt-4.1", 4), - ("claude-sonnet-4-5", 2), -] +MODEL_DISTRIBUTION_LP: list[tuple[str, int]] = [("gpt-4.1", 4), ("claude-sonnet-4-5", 2)] # Line profiler LSP mode -MODEL_DISTRIBUTION_LP_LSP: list[tuple[str, int]] = [ - ("gpt-4.1", 2), - ("claude-sonnet-4-5", 1), -] +MODEL_DISTRIBUTION_LP_LSP: list[tuple[str, int]] = [("gpt-4.1", 2), ("claude-sonnet-4-5", 1)] try: from codeflash.lsp.helpers import is_LSP_enabled diff --git a/codeflash/optimization/function_optimizer.py b/codeflash/optimization/function_optimizer.py index 8776d9c58..afd56519e 100644 --- a/codeflash/optimization/function_optimizer.py +++ b/codeflash/optimization/function_optimizer.py @@ -48,8 +48,6 @@ MAX_REPAIRS_PER_TRACE, MODEL_DISTRIBUTION_EFFECTIVE, MODEL_DISTRIBUTION_LP_EFFECTIVE, - N_CANDIDATES_EFFECTIVE, - N_CANDIDATES_LP_EFFECTIVE, N_TESTS_TO_GENERATE_EFFECTIVE, REFINE_ALL_THRESHOLD, REFINED_CANDIDATE_RANKING_WEIGHTS, @@ -936,7 +934,6 @@ def determine_best_candidate( ) if self.experiment_id else None, - executor=self.executor, ) processor = CandidateProcessor( @@ -1367,7 +1364,6 @@ def generate_optimizations( MODEL_DISTRIBUTION_EFFECTIVE, ExperimentMetadata(id=self.experiment_id, group="control") if run_experiment else None, is_async=self.function_to_optimize.is_async, - executor=self.executor, ) future_references = self.executor.submit( @@ -1391,7 +1387,6 @@ def generate_optimizations( MODEL_DISTRIBUTION_EFFECTIVE, ExperimentMetadata(id=self.experiment_id, group="experiment"), is_async=self.function_to_optimize.is_async, - executor=self.executor, ) futures.append(future_candidates_exp) From cdf85d2c8be74d37a2352b33906b564cfd7fc123 Mon Sep 17 00:00:00 2001 From: Kevin Turcios Date: Tue, 23 Dec 2025 03:56:02 -0500 Subject: [PATCH 3/6] implement trace_id observability --- codeflash/api/aiservice.py | 77 +++++++++++--------- codeflash/models/models.py | 1 + codeflash/optimization/function_optimizer.py | 72 +++++++++++++++--- codeflash/verification/verifier.py | 2 + 4 files changed, 105 insertions(+), 47 deletions(-) diff --git a/codeflash/api/aiservice.py b/codeflash/api/aiservice.py index 78d042791..4dca8096c 100644 --- a/codeflash/api/aiservice.py +++ b/codeflash/api/aiservice.py @@ -118,11 +118,11 @@ def optimize_python_code( # noqa: D417 source_code: str, dependency_code: str, trace_id: str, - num_candidates: int = 10, experiment_metadata: ExperimentMetadata | None = None, *, is_async: bool = False, model: str | None = None, + call_sequence: int | None = None, ) -> list[OptimizedCandidate]: """Optimize the given 
python code for performance by making a request to the Django endpoint. @@ -131,9 +131,9 @@ def optimize_python_code( # noqa: D417 - source_code (str): The python code to optimize. - dependency_code (str): The dependency code used as read-only context for the optimization - trace_id (str): Trace id of optimization run - - num_candidates (int): Number of optimization variants to generate. Default is 10. - experiment_metadata (Optional[ExperimentalMetadata, None]): Any available experiment metadata for this optimization - model (str | None): Model name to use ("gpt-4.1" or "claude-sonnet-4-5"). Default is None (server default). + - call_sequence (int | None): Sequence number for multi-model calls (1, 2, 3...). Default is None. Returns ------- @@ -146,7 +146,6 @@ def optimize_python_code( # noqa: D417 payload = { "source_code": source_code, "dependency_code": dependency_code, - "num_variants": num_candidates, "trace_id": trace_id, "python_version": platform.python_version(), "experiment_metadata": experiment_metadata, @@ -154,13 +153,12 @@ def optimize_python_code( # noqa: D417 "current_username": get_last_commit_author_if_pr_exists(None), "repo_owner": git_repo_owner, "repo_name": git_repo_name, - "n_candidates": num_candidates, "is_async": is_async, "model": model, + "call_sequence": call_sequence, } + logger.debug(f"Sending optimize request: model={model}, trace_id={trace_id}, call_sequence={call_sequence}") - logger.info("!lsp|Generating optimized candidates…") - console.rule() try: response = self.make_ai_service_request("/optimize", payload=payload, timeout=60) except requests.exceptions.RequestException as e: @@ -170,9 +168,9 @@ def optimize_python_code( # noqa: D417 if response.status_code == 200: optimizations_json = response.json()["optimizations"] - console.rule() end_time = time.perf_counter() logger.debug(f"!lsp|Generating possible optimizations took {end_time - start_time:.2f} seconds.") + logger.debug(f"Backend returned {len(optimizations_json)} optimization(s)") return self._get_valid_candidates(optimizations_json, OptimizedCandidateSource.OPTIMIZE, model=model) try: error = response.json()["error"] @@ -180,7 +178,6 @@ def optimize_python_code( # noqa: D417 error = response.text logger.error(f"Error generating optimized candidates: {response.status_code} - {error}") ph("cli-optimize-error-response", {"response_status_code": response.status_code, "error": error}) - console.rule() return [] def optimize_python_code_line_profiler( # noqa: D417 @@ -189,9 +186,9 @@ def optimize_python_code_line_profiler( # noqa: D417 dependency_code: str, trace_id: str, line_profiler_results: str, - num_candidates: int = 10, experiment_metadata: ExperimentMetadata | None = None, model: str | None = None, + call_sequence: int | None = None, ) -> list[OptimizedCandidate]: """Optimize the given python code for performance by making a request to the Django endpoint. @@ -200,9 +197,9 @@ def optimize_python_code_line_profiler( # noqa: D417 - source_code (str): The python code to optimize. - dependency_code (str): The dependency code used as read-only context for the optimization - trace_id (str): Trace id of optimization run - - num_candidates (int): Number of optimization variants to generate. Default is 10. - experiment_metadata (Optional[ExperimentalMetadata, None]): Any available experiment metadata for this optimization - model (str | None): Model name to use ("gpt-4.1" or "claude-sonnet-4-5"). Default is None (server default). 
+ - call_sequence (int | None): Sequence number for multi-model calls (1, 2, 3...). Default is None. Returns ------- @@ -212,21 +209,18 @@ def optimize_python_code_line_profiler( # noqa: D417 payload = { "source_code": source_code, "dependency_code": dependency_code, - "num_variants": num_candidates, "line_profiler_results": line_profiler_results, "trace_id": trace_id, "python_version": platform.python_version(), "experiment_metadata": experiment_metadata, "codeflash_version": codeflash_version, "lsp_mode": is_LSP_enabled(), - "n_candidates_lp": num_candidates, "model": model, + "call_sequence": call_sequence, } - console.rule() if line_profiler_results == "": logger.info("No LineProfiler results were provided, Skipping optimization.") - console.rule() return [] try: response = self.make_ai_service_request("/optimize-line-profiler", payload=payload, timeout=60) @@ -237,10 +231,7 @@ def optimize_python_code_line_profiler( # noqa: D417 if response.status_code == 200: optimizations_json = response.json()["optimizations"] - logger.info( - f"!lsp|Generated {len(optimizations_json)} candidate optimizations using line profiler information." - ) - console.rule() + logger.debug(f"Backend returned {len(optimizations_json)} LP optimization(s)") return self._get_valid_candidates(optimizations_json, OptimizedCandidateSource.OPTIMIZE_LP, model=model) try: error = response.json()["error"] @@ -248,7 +239,6 @@ def optimize_python_code_line_profiler( # noqa: D417 error = response.text logger.error(f"Error generating optimized candidates: {response.status_code} - {error}") ph("cli-optimize-error-response", {"response_status_code": response.status_code, "error": error}) - console.rule() return [] def optimize_python_code_multi_model( @@ -260,32 +250,34 @@ def optimize_python_code_multi_model( experiment_metadata: ExperimentMetadata | None = None, *, is_async: bool = False, - ) -> list[OptimizedCandidate]: + sequence_offset: int = 0, + ) -> tuple[list[OptimizedCandidate], int]: """Generate optimizations using multiple models in parallel.""" + logger.info("Generating optimized candidates…") + console.rule() + futures: list[tuple[concurrent.futures.Future[list[OptimizedCandidate]], str]] = [] - call_index = 0 + call_index = 0 for model_name, num_calls in model_distribution: for _ in range(num_calls): - call_trace_id = f"{base_trace_id[:-4]}M{call_index:02d}" + call_trace_id = f"{base_trace_id[:-3]}0{call_index:02x}" + call_sequence = sequence_offset + call_index + 1 call_index += 1 - future = multi_model_executor.submit( self.optimize_python_code, source_code, dependency_code, call_trace_id, - num_candidates=1, # Each call returns 1 candidate - experiment_metadata=experiment_metadata, + experiment_metadata, is_async=is_async, model=model_name, + call_sequence=call_sequence, ) futures.append((future, model_name)) - # Wait for all calls to complete concurrent.futures.wait([f for f, _ in futures]) - # Collect results all_candidates: list[OptimizedCandidate] = [] for future, model_name in futures: try: @@ -295,7 +287,8 @@ def optimize_python_code_multi_model( logger.warning(f"Model {model_name} call failed: {e}") continue - return all_candidates + console.rule() + return all_candidates, call_index def optimize_python_code_line_profiler_multi_model( self, @@ -305,25 +298,29 @@ def optimize_python_code_line_profiler_multi_model( line_profiler_results: str, model_distribution: list[tuple[str, int]], experiment_metadata: ExperimentMetadata | None = None, - ) -> list[OptimizedCandidate]: + sequence_offset: int = 0, + ) 
-> tuple[list[OptimizedCandidate], int]: """Generate line profiler optimizations using multiple models in parallel.""" + logger.info("Generating optimized candidates with line profiler…") + console.rule() + futures: list[tuple[concurrent.futures.Future[list[OptimizedCandidate]], str]] = [] - call_index = 0 + call_index = 0 for model_name, num_calls in model_distribution: for _ in range(num_calls): - call_trace_id = f"{base_trace_id[:-4]}L{call_index:02d}" + call_trace_id = f"{base_trace_id[:-3]}1{call_index:02x}" + call_sequence = sequence_offset + call_index + 1 call_index += 1 - future = multi_model_executor.submit( self.optimize_python_code_line_profiler, source_code, dependency_code, call_trace_id, line_profiler_results, - num_candidates=1, - experiment_metadata=experiment_metadata, - model=model_name, + experiment_metadata, + model_name, + call_sequence, ) futures.append((future, model_name)) @@ -338,7 +335,8 @@ def optimize_python_code_line_profiler_multi_model( logger.warning(f"Line profiler model {model_name} call failed: {e}") continue - return all_candidates + console.rule() + return all_candidates, call_index def optimize_python_code_refinement(self, request: list[AIServiceRefinerRequest]) -> list[OptimizedCandidate]: """Optimize the given python code for performance by making a request to the Django endpoint. @@ -366,6 +364,7 @@ def optimize_python_code_refinement(self, request: list[AIServiceRefinerRequest] "trace_id": opt.trace_id, "function_references": opt.function_references, "python_version": platform.python_version(), + "call_sequence": opt.call_sequence, } for opt in request ] @@ -455,6 +454,7 @@ def get_new_explanation( # noqa: D417 throughput_improvement: str | None = None, function_references: str | None = None, codeflash_version: str = codeflash_version, + call_sequence: int | None = None, ) -> str: """Optimize the given python code for performance by making a request to the Django endpoint. @@ -500,6 +500,7 @@ def get_new_explanation( # noqa: D417 "throughput_improvement": throughput_improvement, "function_references": function_references, "codeflash_version": codeflash_version, + "call_sequence": call_sequence, } logger.info("loading|Generating explanation") console.rule() @@ -627,6 +628,7 @@ def generate_regression_tests( # noqa: D417 test_timeout: int, trace_id: str, test_index: int, + call_sequence: int | None = None, ) -> tuple[str, str, str] | None: """Generate regression tests for the given function by making a request to the Django endpoint. @@ -662,6 +664,7 @@ def generate_regression_tests( # noqa: D417 "python_version": platform.python_version(), "codeflash_version": codeflash_version, "is_async": function_to_optimize.is_async, + "call_sequence": call_sequence, } try: response = self.make_ai_service_request("/testgen", payload=payload, timeout=90) @@ -702,6 +705,7 @@ def get_optimization_review( replay_tests: str, concolic_tests: str, # noqa: ARG002 calling_fn_details: str, + call_sequence: int | None = None, ) -> str: """Compute the optimization review of current Pull Request. 
@@ -748,6 +752,7 @@ def get_optimization_review( "codeflash_version": codeflash_version, "calling_fn_details": calling_fn_details, "python_version": platform.python_version(), + "call_sequence": call_sequence, } console.rule() try: diff --git a/codeflash/models/models.py b/codeflash/models/models.py index 4f7553818..822ecffab 100644 --- a/codeflash/models/models.py +++ b/codeflash/models/models.py @@ -46,6 +46,7 @@ class AIServiceRefinerRequest: original_line_profiler_results: str optimized_line_profiler_results: str function_references: str | None = None + call_sequence: int | None = None class TestDiffScope(str, Enum): diff --git a/codeflash/optimization/function_optimizer.py b/codeflash/optimization/function_optimizer.py index afd56519e..e8e51deb7 100644 --- a/codeflash/optimization/function_optimizer.py +++ b/codeflash/optimization/function_optimizer.py @@ -139,6 +139,7 @@ def __init__( ai_service_client: AiServiceClient, executor: concurrent.futures.ThreadPoolExecutor, future_all_code_repair: list[concurrent.futures.Future], + sequence_offset: int = 0, ) -> None: self.candidate_queue = queue.Queue() self.line_profiler_done = False @@ -146,6 +147,9 @@ def __init__( self.candidate_len = len(initial_candidates) self.ai_service_client = ai_service_client self.executor = executor + self.sequence_offset = sequence_offset + self.lp_calls_count = 0 + self.refinement_calls_count = 0 # Initialize queue with initial candidates for candidate in initial_candidates: @@ -155,6 +159,9 @@ def __init__( self.all_refinements_data = all_refinements_data self.future_all_code_repair = future_all_code_repair + def get_total_llm_calls(self) -> int: + return self.sequence_offset + self.lp_calls_count + self.refinement_calls_count + def get_next_candidate(self) -> OptimizedCandidate | None: """Get the next candidate from the queue, handling async results as needed.""" try: @@ -176,7 +183,11 @@ def _process_line_profiler_results(self) -> OptimizedCandidate | None: """Process line profiler results and add to queue.""" logger.debug("all candidates processed, await candidates from line profiler") concurrent.futures.wait([self.future_line_profile_results]) - line_profile_results = self.future_line_profile_results.result() + result = self.future_line_profile_results.result() + + # LP multi-model now returns (candidates, lp_call_count) + line_profile_results, lp_call_count = result + self.lp_calls_count = lp_call_count for candidate in line_profile_results: self.candidate_queue.put(candidate) @@ -192,11 +203,18 @@ def refine_optimizations(self, request: list[AIServiceRefinerRequest]) -> concur def _process_refinement_results(self) -> OptimizedCandidate | None: """Process refinement results and add to queue. 
We generate a weighted ranking based on the runtime and diff lines and select the best (round of 45%) of valid optimizations to be refined.""" + import dataclasses # noqa: PLC0415 + future_refinements: list[concurrent.futures.Future] = [] + # Calculate base sequence: offset + lp_calls (refinements come after LP) + base_sequence = self.sequence_offset + self.lp_calls_count + refinement_call_index = 0 if len(self.all_refinements_data) <= REFINE_ALL_THRESHOLD: for data in self.all_refinements_data: - future_refinements.append(self.refine_optimizations([data])) # noqa: PERF401 + refinement_call_index += 1 + data_with_seq = dataclasses.replace(data, call_sequence=base_sequence + refinement_call_index) + future_refinements.append(self.refine_optimizations([data_with_seq])) else: diff_lens_list = [] runtimes_list = [] @@ -215,8 +233,13 @@ def _process_refinement_results(self) -> OptimizedCandidate | None: top_indecies = sorted(score_dict, key=score_dict.get)[:top_n_candidates] for idx in top_indecies: + refinement_call_index += 1 data = self.all_refinements_data[idx] - future_refinements.append(self.refine_optimizations([data])) + data_with_seq = dataclasses.replace(data, call_sequence=base_sequence + refinement_call_index) + future_refinements.append(self.refine_optimizations([data_with_seq])) + + # Track total refinement calls made + self.refinement_calls_count = refinement_call_index if future_refinements: logger.info("loading|Refining generated code for improved quality and performance...") @@ -319,10 +342,14 @@ def __init__( self.optimization_review = "" self.future_all_code_repair: list[concurrent.futures.Future] = [] self.repair_counter = 0 # track how many repairs we did for each function + self.test_gen_calls_count = 0 + self.optimize_calls_count = 0 + self.lp_calls_count = 0 + self.total_llm_calls = 0 def can_be_optimized(self) -> Result[tuple[bool, CodeOptimizationContext, dict[Path, str]], str]: should_run_experiment = self.experiment_id is not None - logger.debug(f"Function Trace ID: {self.function_trace_id}") + logger.info(f"Function Trace ID: {self.function_trace_id}") ph("cli-optimize-function-start", {"function_trace_id": self.function_trace_id}) self.cleanup_leftover_test_return_values() file_name_from_test_module_name.cache_clear() @@ -921,7 +948,6 @@ def determine_best_candidate( ai_service_client = self.aiservice_client if exp_type == "EXP0" else self.local_aiservice_client assert ai_service_client is not None, "AI service client must be set for optimization" - # Use multi-model approach for line profiler optimization future_line_profile_results = self.executor.submit( ai_service_client.optimize_python_code_line_profiler_multi_model, source_code=code_context.read_writable_code.markdown, @@ -934,6 +960,7 @@ def determine_best_candidate( ) if self.experiment_id else None, + sequence_offset=self.optimize_calls_count, ) processor = CandidateProcessor( @@ -943,6 +970,7 @@ def determine_best_candidate( self.aiservice_client, self.executor, self.future_all_code_repair, + sequence_offset=self.optimize_calls_count, ) candidate_index = 0 @@ -976,6 +1004,9 @@ def determine_best_candidate( self.function_to_optimize_source_code, original_helper_code, self.function_to_optimize.file_path ) + # Track total LLM calls from the processor for sequence numbering + self.total_llm_calls = processor.get_total_llm_calls() + # Select and return the best optimization best_optimization = self.select_best_optimization( eval_ctx=eval_ctx, @@ -1355,7 +1386,6 @@ def generate_optimizations( run_experiment: 
bool = False, # noqa: FBT001, FBT002 ) -> Result[tuple[OptimizationSet, str], str]: """Generate optimization candidates for the function using multiple models in parallel.""" - # Use multi-model approach for diversity future_optimization_candidates = self.executor.submit( self.aiservice_client.optimize_python_code_multi_model, read_writable_code.markdown, @@ -1364,6 +1394,7 @@ def generate_optimizations( MODEL_DISTRIBUTION_EFFECTIVE, ExperimentMetadata(id=self.experiment_id, group="control") if run_experiment else None, is_async=self.function_to_optimize.is_async, + sequence_offset=N_TESTS_TO_GENERATE_EFFECTIVE, ) future_references = self.executor.submit( @@ -1387,20 +1418,26 @@ def generate_optimizations( MODEL_DISTRIBUTION_EFFECTIVE, ExperimentMetadata(id=self.experiment_id, group="experiment"), is_async=self.function_to_optimize.is_async, + sequence_offset=N_TESTS_TO_GENERATE_EFFECTIVE, ) futures.append(future_candidates_exp) # Wait for optimization futures to complete concurrent.futures.wait(futures) - # Retrieve results - candidates: list[OptimizedCandidate] = future_optimization_candidates.result() - logger.info(f"!lsp|Generated '{len(candidates)}' candidate optimizations from multiple models.") + # Retrieve results - optimize_python_code_multi_model returns (candidates, call_count) + candidates, optimize_call_count = future_optimization_candidates.result() + # Total sequence count = test gen calls + optimization calls (LP will continue from here) + self.optimize_calls_count = N_TESTS_TO_GENERATE_EFFECTIVE + optimize_call_count + logger.info(f"!lsp|Completed {optimize_call_count} optimization calls, got {len(candidates)} candidates.") if not candidates: return Failure(f"/!\\ NO OPTIMIZATIONS GENERATED for {self.function_to_optimize.function_name}") - candidates_experiment = future_candidates_exp.result() if future_candidates_exp else None + # Handle experiment results - also returns (candidates, call_count) tuple + candidates_experiment = None + if future_candidates_exp: + candidates_experiment, _ = future_candidates_exp.result() function_references = future_references.result() return Success((OptimizationSet(control=candidates, experiment=candidates_experiment), function_references)) @@ -1647,6 +1684,10 @@ def process_review( ) throughput_improvement_str = f"{throughput_improvement_value * 100:.1f}%" + # Explanation call continues the sequence numbering + explanation_call_sequence = self.total_llm_calls + 1 + self.total_llm_calls = explanation_call_sequence + new_explanation_raw_str = self.aiservice_client.get_new_explanation( source_code=code_context.read_writable_code.flat, dependency_code=code_context.read_only_context_code, @@ -1664,6 +1705,7 @@ def process_review( optimized_throughput=optimized_throughput_str, throughput_improvement=throughput_improvement_str, function_references=function_references, + call_sequence=explanation_call_sequence, ) new_explanation = Explanation( raw_explanation_message=new_explanation_raw_str or explanation.raw_explanation_message, @@ -1699,9 +1741,13 @@ def process_review( staging_review = self.args.staging_review opt_review_response = "" # this will now run regardless of pr, staging review flags + # Optimization review call continues the sequence numbering + review_call_sequence = self.total_llm_calls + 1 + self.total_llm_calls = review_call_sequence + try: opt_review_response = self.aiservice_client.get_optimization_review( - **data, calling_fn_details=function_references + **data, calling_fn_details=function_references, 
call_sequence=review_call_sequence ) except Exception as e: logger.debug(f"optimization review response failed, investigate {e}") @@ -2192,6 +2238,9 @@ def submit_test_generation_tasks( generated_test_paths: list[Path], generated_perf_test_paths: list[Path], ) -> list[concurrent.futures.Future]: + # Track how many test generation calls we're making for sequence numbering + self.test_gen_calls_count = len(generated_test_paths) + return [ executor.submit( generate_tests, @@ -2206,6 +2255,7 @@ def submit_test_generation_tasks( test_index, test_path, test_perf_path, + call_sequence=test_index + 1, ) for test_index, (test_path, test_perf_path) in enumerate( zip(generated_test_paths, generated_perf_test_paths) diff --git a/codeflash/verification/verifier.py b/codeflash/verification/verifier.py index 8d187f2b1..d94455df3 100644 --- a/codeflash/verification/verifier.py +++ b/codeflash/verification/verifier.py @@ -27,6 +27,7 @@ def generate_tests( test_index: int, test_path: Path, test_perf_path: Path, + call_sequence: int | None = None, ) -> tuple[str, str, Path] | None: # TODO: Sometimes this recreates the original Class definition. This overrides and messes up the original # class import. Remove the recreation of the class definition @@ -42,6 +43,7 @@ def generate_tests( test_timeout=test_timeout, trace_id=function_trace_id, test_index=test_index, + call_sequence=call_sequence, ) if response and isinstance(response, tuple) and len(response) == 3: generated_test_source, instrumented_behavior_test_source, instrumented_perf_test_source = response From 5a122a99fb44f6b7a82c95afba9c5a8c716d0b04 Mon Sep 17 00:00:00 2001 From: Kevin Turcios Date: Tue, 23 Dec 2025 18:34:37 -0500 Subject: [PATCH 4/6] pre-commit changes --- codeflash/api/aiservice.py | 1 + codeflash/optimization/function_optimizer.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/codeflash/api/aiservice.py b/codeflash/api/aiservice.py index e233e6a71..fff3611fd 100644 --- a/codeflash/api/aiservice.py +++ b/codeflash/api/aiservice.py @@ -721,6 +721,7 @@ def get_optimization_review( root_dir: Path -> path of git directory concolic_tests: str -> concolic_tests (not used) calling_fn_details: str -> filenames and definitions of functions which call the function_to_optimize + call_sequence: int | None -> sequence number for multi-model calls Returns: ------- diff --git a/codeflash/optimization/function_optimizer.py b/codeflash/optimization/function_optimizer.py index e8e51deb7..dfafb86a5 100644 --- a/codeflash/optimization/function_optimizer.py +++ b/codeflash/optimization/function_optimizer.py @@ -203,7 +203,7 @@ def refine_optimizations(self, request: list[AIServiceRefinerRequest]) -> concur def _process_refinement_results(self) -> OptimizedCandidate | None: """Process refinement results and add to queue. 
We generate a weighted ranking based on the runtime and diff lines and select the best (round of 45%) of valid optimizations to be refined.""" - import dataclasses # noqa: PLC0415 + import dataclasses future_refinements: list[concurrent.futures.Future] = [] # Calculate base sequence: offset + lp_calls (refinements come after LP) From 1b6e046553ba97c093a13ae59c862d000e685a13 Mon Sep 17 00:00:00 2001 From: Kevin Turcios Date: Tue, 23 Dec 2025 18:56:10 -0500 Subject: [PATCH 5/6] have the client manage the TPE --- codeflash/api/aiservice.py | 9 ++++----- codeflash/optimization/function_optimizer.py | 3 +++ 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/codeflash/api/aiservice.py b/codeflash/api/aiservice.py index fff3611fd..876ee74c5 100644 --- a/codeflash/api/aiservice.py +++ b/codeflash/api/aiservice.py @@ -35,9 +35,6 @@ from codeflash.models.models import AIServiceCodeRepairRequest, AIServiceRefinerRequest from codeflash.result.explanation import Explanation -multi_model_executor = concurrent.futures.ThreadPoolExecutor(max_workers=10, thread_name_prefix="multi_model") - - class AiServiceClient: def __init__(self) -> None: self.base_url = self.get_aiservice_base_url() @@ -251,6 +248,7 @@ def optimize_python_code_multi_model( *, is_async: bool = False, sequence_offset: int = 0, + executor: concurrent.futures.ThreadPoolExecutor | None = None, ) -> tuple[list[OptimizedCandidate], int]: """Generate optimizations using multiple models in parallel.""" logger.info("Generating optimized candidates…") @@ -264,7 +262,7 @@ def optimize_python_code_multi_model( call_trace_id = f"{base_trace_id[:-3]}0{call_index:02x}" call_sequence = sequence_offset + call_index + 1 call_index += 1 - future = multi_model_executor.submit( + future = executor.submit( self.optimize_python_code, source_code, dependency_code, @@ -299,6 +297,7 @@ def optimize_python_code_line_profiler_multi_model( model_distribution: list[tuple[str, int]], experiment_metadata: ExperimentMetadata | None = None, sequence_offset: int = 0, + executor: concurrent.futures.ThreadPoolExecutor | None = None, ) -> tuple[list[OptimizedCandidate], int]: """Generate line profiler optimizations using multiple models in parallel.""" logger.info("Generating optimized candidates with line profiler…") @@ -312,7 +311,7 @@ def optimize_python_code_line_profiler_multi_model( call_trace_id = f"{base_trace_id[:-3]}1{call_index:02x}" call_sequence = sequence_offset + call_index + 1 call_index += 1 - future = multi_model_executor.submit( + future = executor.submit( self.optimize_python_code_line_profiler, source_code, dependency_code, diff --git a/codeflash/optimization/function_optimizer.py b/codeflash/optimization/function_optimizer.py index dfafb86a5..b29586b96 100644 --- a/codeflash/optimization/function_optimizer.py +++ b/codeflash/optimization/function_optimizer.py @@ -961,6 +961,7 @@ def determine_best_candidate( if self.experiment_id else None, sequence_offset=self.optimize_calls_count, + executor=self.executor, ) processor = CandidateProcessor( @@ -1395,6 +1396,7 @@ def generate_optimizations( ExperimentMetadata(id=self.experiment_id, group="control") if run_experiment else None, is_async=self.function_to_optimize.is_async, sequence_offset=N_TESTS_TO_GENERATE_EFFECTIVE, + executor=self.executor, ) future_references = self.executor.submit( @@ -1419,6 +1421,7 @@ def generate_optimizations( ExperimentMetadata(id=self.experiment_id, group="experiment"), is_async=self.function_to_optimize.is_async, sequence_offset=N_TESTS_TO_GENERATE_EFFECTIVE, + 
executor=self.executor, ) futures.append(future_candidates_exp) From 1c6e9513faa31df7ef57adcb0383fac03819019e Mon Sep 17 00:00:00 2001 From: Kevin Turcios Date: Tue, 23 Dec 2025 18:58:42 -0500 Subject: [PATCH 6/6] we should always have an executor --- codeflash/api/aiservice.py | 5 +++-- codeflash/optimization/function_optimizer.py | 6 +++--- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/codeflash/api/aiservice.py b/codeflash/api/aiservice.py index 876ee74c5..7480252bd 100644 --- a/codeflash/api/aiservice.py +++ b/codeflash/api/aiservice.py @@ -35,6 +35,7 @@ from codeflash.models.models import AIServiceCodeRepairRequest, AIServiceRefinerRequest from codeflash.result.explanation import Explanation + class AiServiceClient: def __init__(self) -> None: self.base_url = self.get_aiservice_base_url() @@ -244,11 +245,11 @@ def optimize_python_code_multi_model( dependency_code: str, base_trace_id: str, model_distribution: list[tuple[str, int]], + executor: concurrent.futures.ThreadPoolExecutor, experiment_metadata: ExperimentMetadata | None = None, *, is_async: bool = False, sequence_offset: int = 0, - executor: concurrent.futures.ThreadPoolExecutor | None = None, ) -> tuple[list[OptimizedCandidate], int]: """Generate optimizations using multiple models in parallel.""" logger.info("Generating optimized candidates…") @@ -295,9 +296,9 @@ def optimize_python_code_line_profiler_multi_model( base_trace_id: str, line_profiler_results: str, model_distribution: list[tuple[str, int]], + executor: concurrent.futures.ThreadPoolExecutor, experiment_metadata: ExperimentMetadata | None = None, sequence_offset: int = 0, - executor: concurrent.futures.ThreadPoolExecutor | None = None, ) -> tuple[list[OptimizedCandidate], int]: """Generate line profiler optimizations using multiple models in parallel.""" logger.info("Generating optimized candidates with line profiler…") diff --git a/codeflash/optimization/function_optimizer.py b/codeflash/optimization/function_optimizer.py index b29586b96..6228ee01a 100644 --- a/codeflash/optimization/function_optimizer.py +++ b/codeflash/optimization/function_optimizer.py @@ -955,13 +955,13 @@ def determine_best_candidate( base_trace_id=self.get_trace_id(exp_type), line_profiler_results=original_code_baseline.line_profile_results["str_out"], model_distribution=MODEL_DISTRIBUTION_LP_EFFECTIVE, + executor=self.executor, experiment_metadata=ExperimentMetadata( id=self.experiment_id, group="control" if exp_type == "EXP0" else "experiment" ) if self.experiment_id else None, sequence_offset=self.optimize_calls_count, - executor=self.executor, ) processor = CandidateProcessor( @@ -1393,10 +1393,10 @@ def generate_optimizations( read_only_context_code, self.function_trace_id[:-4] + "EXP0" if run_experiment else self.function_trace_id, MODEL_DISTRIBUTION_EFFECTIVE, + self.executor, ExperimentMetadata(id=self.experiment_id, group="control") if run_experiment else None, is_async=self.function_to_optimize.is_async, sequence_offset=N_TESTS_TO_GENERATE_EFFECTIVE, - executor=self.executor, ) future_references = self.executor.submit( @@ -1418,10 +1418,10 @@ def generate_optimizations( read_only_context_code, self.function_trace_id[:-4] + "EXP1", MODEL_DISTRIBUTION_EFFECTIVE, + self.executor, ExperimentMetadata(id=self.experiment_id, group="experiment"), is_async=self.function_to_optimize.is_async, sequence_offset=N_TESTS_TO_GENERATE_EFFECTIVE, - executor=self.executor, ) futures.append(future_candidates_exp)
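
Note on the multi-model fan-out (patches 2, 5 and 6): each (model, num_calls) pair in MODEL_DISTRIBUTION is expanded into num_calls independent requests, each asking for a single candidate, submitted on the shared ThreadPoolExecutor and gathered with per-call failure tolerance. Below is a minimal standalone sketch of that pattern, not the real AiServiceClient method: fake_optimize_call and its string return values are hypothetical stand-ins for optimize_python_code and the /optimize endpoint.

import concurrent.futures

# Standard-mode distribution from config_consts.py: 3 GPT-4.1 calls + 2 Claude Sonnet calls,
# each call asked to return a single candidate.
MODEL_DISTRIBUTION = [("gpt-4.1", 3), ("claude-sonnet-4-5", 2)]

def fake_optimize_call(model, trace_id, call_sequence):
    # Hypothetical stand-in for AiServiceClient.optimize_python_code (one candidate per call).
    return [f"candidate from {model} (trace={trace_id}, seq={call_sequence})"]

def multi_model_fanout(base_trace_id, model_distribution, executor, sequence_offset=0):
    futures = []
    call_index = 0
    for model_name, num_calls in model_distribution:
        for _ in range(num_calls):
            # Patch 3: the last 3 characters of the trace id become a mode digit
            # ("0" = standard, "1" = line profiler) plus a two-digit hex call index,
            # so each backend call stays individually traceable.
            call_trace_id = f"{base_trace_id[:-3]}0{call_index:02x}"
            call_sequence = sequence_offset + call_index + 1
            call_index += 1
            futures.append(
                (executor.submit(fake_optimize_call, model_name, call_trace_id, call_sequence), model_name)
            )
    concurrent.futures.wait([f for f, _ in futures])
    all_candidates = []
    for future, model_name in futures:
        try:
            all_candidates.extend(future.result())
        except Exception as e:  # a failed call from one model does not sink the whole batch
            print(f"Model {model_name} call failed: {e}")
    return all_candidates, call_index

if __name__ == "__main__":
    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as pool:
        candidates, n_calls = multi_model_fanout("0123456789abcdef", MODEL_DISTRIBUTION, pool)
    print(f"{n_calls} calls -> {len(candidates)} candidates")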
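
Note on call_sequence numbering (patch 3): the sequence offsets appear intended to give every LLM call under one function trace id an ordered index, chained through test generation, optimization, line profiler optimization, refinement, explanation and review. A worked example with assumed counts follows; N_TESTS_TO_GENERATE_EFFECTIVE is not shown in these patches, so the value 2 below is illustrative only, while the per-model call counts come from MODEL_DISTRIBUTION (3+2) and MODEL_DISTRIBUTION_LP (4+2).

# Test generation: submit_test_generation_tasks passes call_sequence=test_index + 1.
n_test_gen_calls = 2                                        # assumed; sequences 1..2

# generate_optimizations passes sequence_offset=N_TESTS_TO_GENERATE_EFFECTIVE, and
# optimize_python_code_multi_model assigns sequence_offset + call_index + 1 per call.
optimize_calls = 3 + 2                                      # sequences 3..7
optimize_calls_count = n_test_gen_calls + optimize_calls    # 7, stored on the optimizer

# determine_best_candidate passes sequence_offset=self.optimize_calls_count to the
# line profiler fan-out, so those calls continue the numbering.
lp_calls = 4 + 2                                            # sequences 8..13

# Refinements start from sequence_offset + lp_calls_count inside CandidateProcessor,
# then the explanation and optimization-review calls each take the next number.
refinement_first_sequence = optimize_calls_count + lp_calls + 1   # 14
print(optimize_calls_count, lp_calls, refinement_first_sequence)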