Skip to content

Commit e9746c9

Browse files
committed
batch code hash check
1 parent 41378a0 commit e9746c9

File tree

1 file changed

+177
-40
lines changed

1 file changed

+177
-40
lines changed

codeflash/discovery/functions_to_optimize.py

Lines changed: 177 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
from __future__ import annotations
22

33
import ast
4+
import hashlib
45
import os
56
import random
67
import warnings
@@ -145,15 +146,56 @@ def qualified_name(self) -> str:
145146
def qualified_name_with_modules_from_root(self, project_root_path: Path) -> str:
    """Return this function's qualified name prefixed with its module name.

    The module part is whatever ``module_name_from_file_path`` yields for
    ``self.file_path`` and ``project_root_path``; it is joined to
    ``self.qualified_name`` with a single dot.
    """
    module_name = module_name_from_file_path(self.file_path, project_root_path)
    return f"{module_name}.{self.qualified_name}"
147148

149+
def get_code_context_hash(self) -> str:
    """Return a SHA-256 hex digest identifying this function's code context.

    The digest is computed over three parts joined by ``"\\n---\\n"``:

    1. the base name of ``self.file_path`` (not the full path),
    2. ``self.qualified_name``,
    3. the function's source text with surrounding whitespace stripped.

    The source text is the slice of file lines from ``self.starting_line``
    to ``self.ending_line`` (1-indexed, inclusive). When either bound is
    ``None`` the whole file content is used instead.

    If the file cannot be opened or is not valid UTF-8, a warning is logged
    and the digest is computed from ``"<file name>:<qualified name>"`` only,
    so the method always returns a hash instead of raising on read errors.
    """
    try:
        # Keep the guarded region to the read itself; hashing cannot raise OSError.
        with open(self.file_path, encoding="utf-8") as f:
            file_content = f.read()
    except (OSError, UnicodeDecodeError) as e:
        # IOError is an alias of OSError, so OSError alone covers I/O failures.
        # UnicodeDecodeError covers files that are not valid UTF-8.
        logger.warning(f"Could not read file {self.file_path} for hashing: {e}")
        # Fallback hash using available metadata
        fallback_string = f"{self.file_path.name}:{self.qualified_name}"
        return hashlib.sha256(fallback_string.encode("utf-8")).hexdigest()

    if self.starting_line is not None and self.ending_line is not None:
        # starting_line is 1-indexed and ending_line is inclusive, hence the
        # ``- 1`` on the lower bound only.
        lines = file_content.splitlines()
        function_content = "\n".join(lines[self.starting_line - 1 : self.ending_line])
    else:
        # Fallback: use the entire file content if line numbers aren't available
        function_content = file_content

    context_parts = [
        str(self.file_path.name),  # Just the filename, for portability across checkouts
        self.qualified_name,
        function_content.strip(),
    ]
    context_string = "\n---\n".join(context_parts)
    return hashlib.sha256(context_string.encode("utf-8")).hexdigest()
188+
189+
148190
def get_functions_to_optimize(
149-
optimize_all: str | None,
150-
replay_test: str | None,
151-
file: Path | None,
152-
only_get_this_function: str | None,
153-
test_cfg: TestConfig,
154-
ignore_paths: list[Path],
155-
project_root: Path,
156-
module_root: Path,
191+
optimize_all: str | None,
192+
replay_test: str | None,
193+
file: Path | None,
194+
only_get_this_function: str | None,
195+
test_cfg: TestConfig,
196+
ignore_paths: list[Path],
197+
project_root: Path,
198+
module_root: Path,
157199
) -> tuple[dict[Path, list[FunctionToOptimize]], int]:
158200
assert sum([bool(optimize_all), bool(replay_test), bool(file)]) <= 1, (
159201
"Only one of optimize_all, replay_test, or file should be provided"
@@ -186,7 +228,7 @@ def get_functions_to_optimize(
186228
found_function = None
187229
for fn in functions.get(file, []):
188230
if only_function_name == fn.function_name and (
189-
class_name is None or class_name == fn.top_level_parent_name
231+
class_name is None or class_name == fn.top_level_parent_name
190232
):
191233
found_function = fn
192234
if found_function is None:
@@ -224,8 +266,8 @@ def get_functions_within_git_diff() -> dict[str, list[FunctionToOptimize]]:
224266
function_to_optimize
225267
for function_to_optimize in function_lines.functions
226268
if (start_line := function_to_optimize.starting_line) is not None
227-
and (end_line := function_to_optimize.ending_line) is not None
228-
and any(start_line <= line <= end_line for line in modified_lines[path_str])
269+
and (end_line := function_to_optimize.ending_line) is not None
270+
and any(start_line <= line <= end_line for line in modified_lines[path_str])
229271
]
230272
return modified_functions
231273

@@ -258,7 +300,7 @@ def find_all_functions_in_file(file_path: Path) -> dict[Path, list[FunctionToOpt
258300

259301

260302
def get_all_replay_test_functions(
261-
replay_test: Path, test_cfg: TestConfig, project_root_path: Path
303+
replay_test: Path, test_cfg: TestConfig, project_root_path: Path
262304
) -> dict[Path, list[FunctionToOptimize]]:
263305
function_tests = discover_unit_tests(test_cfg, discover_only_these_tests=[replay_test])
264306
# Get the absolute file paths for each function, excluding class name if present
@@ -273,7 +315,7 @@ def get_all_replay_test_functions(
273315
class_name = (
274316
module_path_parts[-1]
275317
if module_path_parts
276-
and is_class_defined_in_file(
318+
and is_class_defined_in_file(
277319
module_path_parts[-1], Path(project_root_path, *module_path_parts[:-1]).with_suffix(".py")
278320
)
279321
else None
@@ -323,7 +365,8 @@ def ignored_submodule_paths(module_root: str) -> list[str]:
323365

324366
class TopLevelFunctionOrMethodVisitor(ast.NodeVisitor):
325367
def __init__(
326-
self, file_name: Path, function_or_method_name: str, class_name: str | None = None, line_no: int | None = None
368+
self, file_name: Path, function_or_method_name: str, class_name: str | None = None,
369+
line_no: int | None = None
327370
) -> None:
328371
self.file_name = file_name
329372
self.class_name = class_name
@@ -354,13 +397,13 @@ def visit_ClassDef(self, node: ast.ClassDef) -> None:
354397
if isinstance(body_node, ast.FunctionDef) and body_node.name == self.function_name:
355398
self.is_top_level = True
356399
if any(
357-
isinstance(decorator, ast.Name) and decorator.id == "classmethod"
358-
for decorator in body_node.decorator_list
400+
isinstance(decorator, ast.Name) and decorator.id == "classmethod"
401+
for decorator in body_node.decorator_list
359402
):
360403
self.is_classmethod = True
361404
elif any(
362-
isinstance(decorator, ast.Name) and decorator.id == "staticmethod"
363-
for decorator in body_node.decorator_list
405+
isinstance(decorator, ast.Name) and decorator.id == "staticmethod"
406+
for decorator in body_node.decorator_list
364407
):
365408
self.is_staticmethod = True
366409
return
@@ -369,13 +412,13 @@ def visit_ClassDef(self, node: ast.ClassDef) -> None:
369412
# This way, if we don't have the class name, we can still find the static method
370413
for body_node in node.body:
371414
if (
372-
isinstance(body_node, ast.FunctionDef)
373-
and body_node.name == self.function_name
374-
and body_node.lineno in {self.line_no, self.line_no + 1}
375-
and any(
376-
isinstance(decorator, ast.Name) and decorator.id == "staticmethod"
377-
for decorator in body_node.decorator_list
378-
)
415+
isinstance(body_node, ast.FunctionDef)
416+
and body_node.name == self.function_name
417+
and body_node.lineno in {self.line_no, self.line_no + 1}
418+
and any(
419+
isinstance(decorator, ast.Name) and decorator.id == "staticmethod"
420+
for decorator in body_node.decorator_list
421+
)
379422
):
380423
self.is_staticmethod = True
381424
self.is_top_level = True
@@ -386,7 +429,7 @@ def visit_ClassDef(self, node: ast.ClassDef) -> None:
386429

387430

388431
def inspect_top_level_functions_or_methods(
389-
file_name: Path, function_or_method_name: str, class_name: str | None = None, line_no: int | None = None
432+
file_name: Path, function_or_method_name: str, class_name: str | None = None, line_no: int | None = None
390433
) -> FunctionProperties:
391434
with open(file_name, encoding="utf8") as file:
392435
try:
@@ -408,13 +451,93 @@ def inspect_top_level_functions_or_methods(
408451
)
409452

410453

454+
def check_optimization_status(
    functions_by_file: dict[Path, list[FunctionToOptimize]],
    owner: str,
    repo: str,
    pr_number: int,
) -> tuple[dict[Path, list[FunctionToOptimize]], int]:
    """Filter out functions the optimization service reports as already optimized.

    Each function is identified by the key ``"<file_path>:<qualified_name>"``
    and sent together with its ``get_code_context_hash()`` value in a single
    POST request. Keys listed under ``already_optimized_paths`` in the JSON
    response are dropped from the result.

    NOTE(review): the original author's notes say the service also records new
    hashes in a database; that happens server-side and cannot be confirmed here.

    Args:
        functions_by_file: Dictionary mapping file paths to lists of functions.
        owner: Repository owner.
        repo: Repository name.
        pr_number: Pull request number (sent to the API as a string).

    Returns:
        Tuple of ``(filtered_functions_dict, remaining_count)``. On success the
        dictionary contains only files that still have at least one function.
        If there is nothing to check, ``({}, 0)`` is returned without any
        network access. If the request or response handling fails, a warning
        is logged and the input dictionary is returned unchanged together with
        its total function count.
    """
    # Build the code_contexts dictionary for the API call
    code_contexts: dict[str, str] = {}
    path_to_function_map = {}

    for file_path, functions in functions_by_file.items():
        for func in functions:
            # Use a unique path identifier that includes function info
            path_key = f"{file_path}:{func.qualified_name}"
            code_contexts[path_key] = func.get_code_context_hash()
            path_to_function_map[path_key] = (file_path, func)

    if not code_contexts:
        return {}, 0

    # Imported only once a request is actually needed, so the empty case above
    # does not depend on the third-party package being importable.
    import requests

    try:
        # Call the optimization check API
        response = requests.post(
            "http://your-api-endpoint/is_code_being_optimized_again",  # Replace with actual endpoint
            json={
                "owner": owner,
                "repo": repo,
                "pr_number": str(pr_number),
                "code_contexts": code_contexts,
            },
            timeout=30,
        )
        response.raise_for_status()

        result = response.json()
        # ``or []`` treats a JSON null the same as a missing key.
        already_optimized_paths = set(result.get("already_optimized_paths") or [])
    except Exception as e:
        # Deliberate best-effort boundary: any failure talking to the service
        # must not block optimization, so fall back to the unfiltered input.
        logger.warning(f"Failed to check optimization status: {e}")
        logger.info("Proceeding with all functions (optimization check failed)")
        total_count = sum(len(funcs) for funcs in functions_by_file.values())
        return functions_by_file, total_count

    logger.info(f"Found {len(already_optimized_paths)} already optimized functions")

    # Filter out already optimized functions
    filtered_functions: dict[Path, list[FunctionToOptimize]] = {}
    remaining_count = 0
    for path_key, (file_path, func) in path_to_function_map.items():
        if path_key in already_optimized_paths:
            continue
        filtered_functions.setdefault(file_path, []).append(func)
        remaining_count += 1

    return filtered_functions, remaining_count
529+
530+
411531
def filter_functions(
412-
modified_functions: dict[Path, list[FunctionToOptimize]],
413-
tests_root: Path,
414-
ignore_paths: list[Path],
415-
project_root: Path,
416-
module_root: Path,
417-
disable_logs: bool = False,
532+
modified_functions: dict[Path, list[FunctionToOptimize]],
533+
tests_root: Path,
534+
ignore_paths: list[Path],
535+
project_root: Path,
536+
module_root: Path,
537+
disable_logs: bool = False,
538+
owner: str | None = None,
539+
repo: str | None = None,
540+
pr_number: int | None = None,
418541
) -> tuple[dict[Path, list[FunctionToOptimize]], int]:
419542
blocklist_funcs = get_blocklisted_functions()
420543
# Remove any function that we don't want to optimize
@@ -432,19 +555,20 @@ def filter_functions(
432555
submodule_ignored_paths_count: int = 0
433556
tests_root_str = str(tests_root)
434557
module_root_str = str(module_root)
558+
435559
# We desperately need Python 3.10+ only support to make this code readable with structural pattern matching
436560
for file_path_path, functions in modified_functions.items():
437561
file_path = str(file_path_path)
438562
if file_path.startswith(tests_root_str + os.sep):
439563
test_functions_removed_count += len(functions)
440564
continue
441565
if file_path in ignore_paths or any(
442-
file_path.startswith(str(ignore_path) + os.sep) for ignore_path in ignore_paths
566+
file_path.startswith(str(ignore_path) + os.sep) for ignore_path in ignore_paths
443567
):
444568
ignore_paths_removed_count += 1
445569
continue
446570
if file_path in submodule_paths or any(
447-
file_path.startswith(str(submodule_path) + os.sep) for submodule_path in submodule_paths
571+
file_path.startswith(str(submodule_path) + os.sep) for submodule_path in submodule_paths
448572
):
449573
submodule_ignored_paths_count += 1
450574
continue
@@ -464,13 +588,25 @@ def filter_functions(
464588
function
465589
for function in functions
466590
if not (
467-
function.file_path.name in blocklist_funcs
468-
and function.qualified_name in blocklist_funcs[function.file_path.name]
591+
function.file_path.name in blocklist_funcs
592+
and function.qualified_name in blocklist_funcs[function.file_path.name]
469593
)
470594
]
471595
filtered_modified_functions[file_path] = functions
472596
functions_count += len(functions)
473597

598+
# Convert to Path keys for optimization check
599+
path_based_functions = {Path(k): v for k, v in filtered_modified_functions.items() if v}
600+
601+
# Check optimization status if repository info is provided
602+
already_optimized_count = 0
603+
if owner and repo and pr_number is not None:
604+
path_based_functions, functions_count = check_optimization_status(
605+
path_based_functions, owner, repo, pr_number
606+
)
607+
initial_count = sum(len(funcs) for funcs in filtered_modified_functions.values())
608+
already_optimized_count = initial_count - functions_count
609+
474610
if not disable_logs:
475611
log_info = {
476612
f"{test_functions_removed_count} test function{'s' if test_functions_removed_count != 1 else ''}": test_functions_removed_count,
@@ -479,13 +615,14 @@ def filter_functions(
479615
f"{non_modules_removed_count} function{'s' if non_modules_removed_count != 1 else ''} outside module-root": non_modules_removed_count,
480616
f"{ignore_paths_removed_count} file{'s' if ignore_paths_removed_count != 1 else ''} from ignored paths": ignore_paths_removed_count,
481617
f"{submodule_ignored_paths_count} file{'s' if submodule_ignored_paths_count != 1 else ''} from ignored submodules": submodule_ignored_paths_count,
618+
f"{already_optimized_count} already optimized function{'s' if already_optimized_count != 1 else ''}": already_optimized_count,
482619
}
483620
log_string = "\n".join([k for k, v in log_info.items() if v > 0])
484621
if log_string:
485622
logger.info(f"Ignoring: {log_string}")
486623
console.rule()
487624

488-
return {Path(k): v for k, v in filtered_modified_functions.items() if v}, functions_count
625+
return path_based_functions, functions_count
489626

490627

491628
def filter_files_optimized(file_path: Path, tests_root: Path, ignore_paths: list[Path], module_root: Path) -> bool:
@@ -505,8 +642,8 @@ def filter_files_optimized(file_path: Path, tests_root: Path, ignore_paths: list
505642
if submodule_paths is None:
506643
submodule_paths = ignored_submodule_paths(module_root)
507644
return not (
508-
file_path in submodule_paths
509-
or any(file_path.is_relative_to(submodule_path) for submodule_path in submodule_paths)
645+
file_path in submodule_paths
646+
or any(file_path.is_relative_to(submodule_path) for submodule_path in submodule_paths)
510647
)
511648

512649

@@ -515,4 +652,4 @@ def function_has_return_statement(function_node: FunctionDef | AsyncFunctionDef)
515652

516653

517654
def function_is_a_property(function_node: FunctionDef | AsyncFunctionDef) -> bool:
    """Report whether the function has a bare ``property`` decorator.

    Only decorators that are plain names (``ast.Name``) with the identifier
    ``"property"`` count; attribute or call decorators are not matched.
    """
    for decorator in function_node.decorator_list:
        if isinstance(decorator, ast.Name) and decorator.id == "property":
            return True
    return False

0 commit comments

Comments
 (0)