Add a first version of hashing code context

misrasaurabh1 · misrasaurabh1 · commit 5c0a028dc267 · 2025-06-07T17:39:24.000-07:00
diff --git a/codeflash/context/code_context_extractor.py b/codeflash/context/code_context_extractor.py
@@ -73,6 +73,13 @@ def get_code_optimization_context(
         remove_docstrings=False,
         code_context_type=CodeContextType.READ_ONLY,
     )
+    hashing_code_context = extract_code_markdown_context_from_files(
+        helpers_of_fto_dict,
+        helpers_of_helpers_dict,
+        project_root_path,
+        remove_docstrings=True,
+        code_context_type=CodeContextType.HASHING,
+    )
 
     # Handle token limits
     final_read_writable_tokens = encoded_tokens_len(final_read_writable_code)
@@ -130,6 +137,7 @@ def get_code_optimization_context(
         testgen_context_code=testgen_context_code,
         read_writable_code=final_read_writable_code,
         read_only_context_code=read_only_context_code,
+        hashing_code_context=hashing_code_context.markdown,
         helper_functions=helpers_of_fto_list,
         preexisting_objects=preexisting_objects,
     )
@@ -309,20 +317,21 @@ def extract_code_markdown_context_from_files(
             logger.debug(f"Error while getting read-only code: {e}")
             continue
         if code_context.strip():
-            code_context_with_imports = CodeString(
-                code=add_needed_imports_from_module(
-                    src_module_code=original_code,
-                    dst_module_code=code_context,
-                    src_path=file_path,
-                    dst_path=file_path,
-                    project_root=project_root_path,
-                    helper_functions=list(
-                        helpers_of_fto.get(file_path, set()) | helpers_of_helpers.get(file_path, set())
+            # Skip import re-insertion for HASHING (its pruner strips imports). Assign the call result
+            # directly: a parenthesized call with a trailing comma is a 1-tuple, not the str CodeString needs.
+            if code_context_type != CodeContextType.HASHING:
+                code_context = add_needed_imports_from_module(
+                    src_module_code=original_code,
+                    dst_module_code=code_context,
+                    src_path=file_path,
+                    dst_path=file_path,
+                    project_root=project_root_path,
+                    helper_functions=list(
+                        helpers_of_fto.get(file_path, set()) | helpers_of_helpers.get(file_path, set())
                     ),
-                ),
-                file_path=file_path.relative_to(project_root_path),
-            )
-            code_context_markdown.code_strings.append(code_context_with_imports)
+                )
+            code_string_context = CodeString(code=code_context, file_path=file_path.relative_to(project_root_path))
+            code_context_markdown.code_strings.append(code_string_context)
     # Extract code from file paths containing helpers of helpers
     for file_path, helper_function_sources in helpers_of_helpers_no_overlap.items():
         try:
@@ -343,18 +352,19 @@ def extract_code_markdown_context_from_files(
             continue
 
         if code_context.strip():
-            code_context_with_imports = CodeString(
-                code=add_needed_imports_from_module(
-                    src_module_code=original_code,
-                    dst_module_code=code_context,
-                    src_path=file_path,
-                    dst_path=file_path,
-                    project_root=project_root_path,
-                    helper_functions=list(helpers_of_helpers_no_overlap.get(file_path, set())),
-                ),
-                file_path=file_path.relative_to(project_root_path),
-            )
-            code_context_markdown.code_strings.append(code_context_with_imports)
+            # Skip import re-insertion for HASHING (its pruner strips imports). Assign the call result
+            # directly: a parenthesized call with a trailing comma is a 1-tuple, not the str CodeString needs.
+            if code_context_type != CodeContextType.HASHING:
+                code_context = add_needed_imports_from_module(
+                    src_module_code=original_code,
+                    dst_module_code=code_context,
+                    src_path=file_path,
+                    dst_path=file_path,
+                    project_root=project_root_path,
+                    helper_functions=list(helpers_of_helpers_no_overlap.get(file_path, set())),
+                )
+            code_string_context = CodeString(code=code_context, file_path=file_path.relative_to(project_root_path))
+            code_context_markdown.code_strings.append(code_string_context)
     return code_context_markdown
 
 
@@ -492,6 +502,8 @@ def parse_code_and_prune_cst(
         filtered_node, found_target = prune_cst_for_testgen_code(
             module, target_functions, helpers_of_helper_functions, remove_docstrings=remove_docstrings
         )
+    elif code_context_type == CodeContextType.HASHING:
+        filtered_node, found_target = prune_cst_for_code_hashing(module, target_functions)
     else:
         raise ValueError(f"Unknown code_context_type: {code_context_type}")  # noqa: EM102
 
@@ -583,6 +595,87 @@ def prune_cst_for_read_writable_code(  # noqa: PLR0911
     return (node.with_changes(**updates) if updates else node), True
 
 
+def prune_cst_for_code_hashing(  # noqa: PLR0911
+    node: cst.CSTNode, target_functions: set[str], prefix: str = ""
+) -> tuple[cst.CSTNode | None, bool]:
+    """Recursively filter the node and its children to build the code context used for hashing.
+
+    Import statements and non-target functions/classes are removed; docstrings of kept targets are stripped.
+
+    Returns
+    -------
+        (filtered_node, found_target):
+          filtered_node: The modified CST node or None if it should be removed.
+          found_target: True if a target function was found in this node's subtree.
+
+    """
+    if isinstance(node, (cst.Import, cst.ImportFrom)):
+        return None, False
+
+    if isinstance(node, cst.FunctionDef):
+        qualified_name = f"{prefix}.{node.name.value}" if prefix else node.name.value
+        if qualified_name in target_functions:
+            new_body = remove_docstring_from_body(node.body)
+            return node.with_changes(body=new_body), True
+        return None, False
+
+    if isinstance(node, cst.ClassDef):
+        # Do not recurse into nested classes
+        if prefix:
+            return None, False
+        # Assuming always an IndentedBlock
+        if not isinstance(node.body, cst.IndentedBlock):
+            raise ValueError("ClassDef body is not an IndentedBlock")  # noqa: TRY004
+        # prefix is always empty here (handled above), so the class name alone qualifies its methods.
+        class_prefix = node.name.value
+        new_body = []
+        for stmt in node.body.body:
+            if not isinstance(stmt, cst.FunctionDef):
+                continue
+            if f"{class_prefix}.{stmt.name.value}" in target_functions:
+                # Strip method docstrings too, matching how top-level target functions are handled.
+                new_body.append(stmt.with_changes(body=remove_docstring_from_body(stmt.body)))
+        # If no target functions found, remove the class entirely
+        if not new_body:
+            return None, False
+        # Only FunctionDef statements are kept, so the class docstring is never carried over.
+        return node.with_changes(body=node.body.with_changes(body=new_body)), True
+
+    # For other nodes, we preserve them only if they contain target functions in their children.
+    section_names = get_section_names(node)
+    if not section_names:
+        return node, False
+
+    updates: dict[str, list[cst.CSTNode] | cst.CSTNode] = {}
+    found_any_target = False
+
+    for section in section_names:
+        original_content = getattr(node, section, None)
+        if isinstance(original_content, (list, tuple)):
+            new_children = []
+            section_found_target = False
+            for child in original_content:
+                filtered, found_target = prune_cst_for_code_hashing(child, target_functions, prefix)
+                if filtered:
+                    new_children.append(filtered)
+                section_found_target |= found_target
+
+            if section_found_target:
+                found_any_target = True
+                updates[section] = new_children
+        elif original_content is not None:
+            filtered, found_target = prune_cst_for_code_hashing(original_content, target_functions, prefix)
+            if found_target:
+                found_any_target = True
+                if filtered:
+                    updates[section] = filtered
+
+    if not found_any_target:
+        return None, False
+
+    return (node.with_changes(**updates) if updates else node), True
+
+
 def prune_cst_for_read_only_code(  # noqa: PLR0911
     node: cst.CSTNode,
     target_functions: set[str],
diff --git a/codeflash/models/models.py b/codeflash/models/models.py
@@ -16,7 +16,7 @@
 from enum import Enum, IntEnum
 from pathlib import Path
 from re import Pattern
-from typing import Annotated, Optional, cast
+from typing import Annotated, cast
 
 from jedi.api.classes import Name
 from pydantic import AfterValidator, BaseModel, ConfigDict, Field
@@ -77,10 +77,10 @@ class BestOptimization(BaseModel):
     candidate: OptimizedCandidate
     helper_functions: list[FunctionSource]
     runtime: int
-    replay_performance_gain: Optional[dict[BenchmarkKey, float]] = None
+    replay_performance_gain: dict[BenchmarkKey, float] | None = None
     winning_behavioral_test_results: TestResults
     winning_benchmarking_test_results: TestResults
-    winning_replay_benchmarking_test_results: Optional[TestResults] = None
+    winning_replay_benchmarking_test_results: TestResults | None = None
 
 
 @dataclass(frozen=True)
@@ -136,7 +136,7 @@ def to_dict(self) -> dict[str, list[dict[str, any]]]:
 
 class CodeString(BaseModel):
     code: Annotated[str, AfterValidator(validate_python_code)]
-    file_path: Optional[Path] = None
+    file_path: Path | None = None
 
 
 class CodeStringsMarkdown(BaseModel):
@@ -157,6 +157,7 @@ class CodeOptimizationContext(BaseModel):
     testgen_context_code: str = ""
     read_writable_code: str = Field(min_length=1)
     read_only_context_code: str = ""
+    hashing_code_context: str = ""
     helper_functions: list[FunctionSource]
     preexisting_objects: set[tuple[str, tuple[FunctionParent, ...]]]
 
@@ -165,14 +166,15 @@ class CodeContextType(str, Enum):
     READ_WRITABLE = "READ_WRITABLE"
     READ_ONLY = "READ_ONLY"
     TESTGEN = "TESTGEN"
+    HASHING = "HASHING"
 
 
 class OptimizedCandidateResult(BaseModel):
     max_loop_count: int
     best_test_runtime: int
     behavior_test_results: TestResults
     benchmarking_test_results: TestResults
-    replay_benchmarking_test_results: Optional[dict[BenchmarkKey, TestResults]] = None
+    replay_benchmarking_test_results: dict[BenchmarkKey, TestResults] | None = None
     optimization_candidate_index: int
     total_candidate_timing: int
 
@@ -192,10 +194,10 @@ class GeneratedTestsList(BaseModel):
 class TestFile(BaseModel):
     instrumented_behavior_file_path: Path
     benchmarking_file_path: Path = None
-    original_file_path: Optional[Path] = None
-    original_source: Optional[str] = None
+    original_file_path: Path | None = None
+    original_source: str | None = None
     test_type: TestType
-    tests_in_file: Optional[list[TestsInFile]] = None
+    tests_in_file: list[TestsInFile] | None = None
 
 
 class TestFiles(BaseModel):
@@ -238,13 +240,13 @@ def __len__(self) -> int:
 
 class OptimizationSet(BaseModel):
     control: list[OptimizedCandidate]
-    experiment: Optional[list[OptimizedCandidate]]
+    experiment: list[OptimizedCandidate] | None
 
 
 @dataclass(frozen=True)
 class TestsInFile:
     test_file: Path
-    test_class: Optional[str]
+    test_class: str | None
     test_function: str
     test_type: TestType
 
@@ -277,10 +279,10 @@ class FunctionParent:
 class OriginalCodeBaseline(BaseModel):
     behavioral_test_results: TestResults
     benchmarking_test_results: TestResults
-    replay_benchmarking_test_results: Optional[dict[BenchmarkKey, TestResults]] = None
+    replay_benchmarking_test_results: dict[BenchmarkKey, TestResults] | None = None
     line_profile_results: dict
     runtime: int
-    coverage_results: Optional[CoverageData]
+    coverage_results: CoverageData | None
 
 
 class CoverageStatus(Enum):
@@ -299,7 +301,7 @@ class CoverageData:
     graph: dict[str, dict[str, Collection[object]]]
     code_context: CodeOptimizationContext
     main_func_coverage: FunctionCoverage
-    dependent_func_coverage: Optional[FunctionCoverage]
+    dependent_func_coverage: FunctionCoverage | None
     status: CoverageStatus
     blank_re: Pattern[str] = re.compile(r"\s*(#|$)")
     else_re: Pattern[str] = re.compile(r"\s*else\s*:\s*(#|$)")
@@ -407,10 +409,10 @@ def to_name(self) -> str:
 @dataclass(frozen=True)
 class InvocationId:
     test_module_path: str  # The fully qualified name of the test module
-    test_class_name: Optional[str]  # The name of the class where the test is defined
-    test_function_name: Optional[str]  # The name of the test_function. Does not include the components of the file_name
+    test_class_name: str | None  # The name of the class where the test is defined
+    test_function_name: str | None  # The name of the test_function. Does not include the components of the file_name
     function_getting_tested: str
-    iteration_id: Optional[str]
+    iteration_id: str | None
 
     # test_module_path:TestSuiteClass.test_function_name:function_tested:iteration_id
     def id(self) -> str:
@@ -421,7 +423,7 @@ def id(self) -> str:
         )
 
     @staticmethod
-    def from_str_id(string_id: str, iteration_id: Optional[str] = None) -> InvocationId:
+    def from_str_id(string_id: str, iteration_id: str | None = None) -> InvocationId:
         components = string_id.split(":")
         assert len(components) == 4
         second_components = components[1].split(".")
@@ -446,13 +448,13 @@ class FunctionTestInvocation:
     id: InvocationId  # The fully qualified name of the function invocation (id)
     file_name: Path  # The file where the test is defined
     did_pass: bool  # Whether the test this function invocation was part of, passed or failed
-    runtime: Optional[int]  # Time in nanoseconds
+    runtime: int | None  # Time in nanoseconds
     test_framework: str  # unittest or pytest
     test_type: TestType
-    return_value: Optional[object]  # The return value of the function invocation
-    timed_out: Optional[bool]
-    verification_type: Optional[str] = VerificationType.FUNCTION_CALL
-    stdout: Optional[str] = None
+    return_value: object | None  # The return value of the function invocation
+    timed_out: bool | None
+    verification_type: str | None = VerificationType.FUNCTION_CALL
+    stdout: str | None = None
 
     @property
     def unique_invocation_loop_id(self) -> str:
diff --git a/codeflash/optimization/function_optimizer.py b/codeflash/optimization/function_optimizer.py
@@ -1,7 +1,6 @@
 from __future__ import annotations
 
 import ast
-import git
 import concurrent.futures
 import os
 import subprocess
@@ -52,8 +51,6 @@
 from codeflash.code_utils.line_profile_utils import add_decorator_imports
 from codeflash.code_utils.static_analysis import get_first_top_level_function_or_method_ast
 from codeflash.code_utils.time_utils import humanize_runtime
-from codeflash.code_utils.env_utils import get_pr_number
-from codeflash.code_utils.git_utils import get_repo_owner_and_name
 from codeflash.context import code_context_extractor
 from codeflash.context.unused_definition_remover import detect_unused_helper_functions, revert_unused_helper_functions
 from codeflash.either import Failure, Success, is_successful
@@ -265,7 +262,7 @@ def optimize_function(self) -> Result[BestOptimization, str]:  # noqa: PLR0911
         # adding to control and experiment set but with same traceid
         best_optimization = None
         for _u, (candidates, exp_type) in enumerate(
-            zip([optimizations_set.control, optimizations_set.experiment], ["EXP0", "EXP1"])
+            zip([optimizations_set.control, optimizations_set.experiment], ["EXP0", "EXP1"], strict=False)
         ):
             if candidates is None:
                 continue
@@ -687,6 +684,7 @@ def get_code_optimization_context(self) -> Result[CodeOptimizationContext, str]:
                 testgen_context_code=new_code_ctx.testgen_context_code,
                 read_writable_code=new_code_ctx.read_writable_code,
                 read_only_context_code=new_code_ctx.read_only_context_code,
+                hashing_code_context=new_code_ctx.hashing_code_context,
                 helper_functions=new_code_ctx.helper_functions,  # only functions that are read writable
                 preexisting_objects=new_code_ctx.preexisting_objects,
             )
@@ -1283,7 +1281,7 @@ def generate_and_instrument_tests(
                 test_perf_path,
             )
             for test_index, (test_path, test_perf_path) in enumerate(
-                zip(generated_test_paths, generated_perf_test_paths)
+                zip(generated_test_paths, generated_perf_test_paths, strict=False)
             )
         ]