Skip to content

Commit 8e516fc

Browse files
committed
critic
1 parent 04d31b5 commit 8e516fc

File tree

5 files changed

+247
-14
lines changed

5 files changed

+247
-14
lines changed

codeflash/code_utils/config_consts.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,3 +11,4 @@
1111
MIN_TESTCASE_PASSED_THRESHOLD = 6
1212
REPEAT_OPTIMIZATION_PROBABILITY = 0.1
1313
DEFAULT_IMPORTANCE_THRESHOLD = 0.001
14+
MIN_THROUGHPUT_IMPROVEMENT_THRESHOLD = 0.10 # 10% minimum improvement for async throughput

codeflash/optimization/function_optimizer.py

Lines changed: 28 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,13 @@
7777
TestType,
7878
)
7979
from codeflash.result.create_pr import check_create_pr, existing_tests_source_for
80-
from codeflash.result.critic import coverage_critic, performance_gain, quantity_of_tests_critic, speedup_critic
80+
from codeflash.result.critic import (
81+
coverage_critic,
82+
performance_gain,
83+
quantity_of_tests_critic,
84+
speedup_critic,
85+
throughput_gain,
86+
)
8187
from codeflash.result.explanation import Explanation
8288
from codeflash.telemetry.posthog_cf import ph
8389
from codeflash.verification.concolic_testing import generate_concolic_tests
@@ -566,7 +572,11 @@ def determine_best_candidate(
566572
tree = Tree(f"Candidate #{candidate_index} - Runtime Information")
567573
benchmark_tree = None
568574
if speedup_critic(
569-
candidate_result, original_code_baseline.runtime, best_runtime_until_now=None
575+
candidate_result,
576+
original_code_baseline.runtime,
577+
best_runtime_until_now=None,
578+
original_async_throughput=original_code_baseline.async_throughput,
579+
best_throughput_until_now=None,
570580
) and quantity_of_tests_critic(candidate_result):
571581
tree.add("This candidate is faster than the original code. 🚀") # TODO: Change this description
572582
tree.add(f"Original summed runtime: {humanize_runtime(original_code_baseline.runtime)}")
@@ -577,6 +587,19 @@ def determine_best_candidate(
577587
)
578588
tree.add(f"Speedup percentage: {perf_gain * 100:.1f}%")
579589
tree.add(f"Speedup ratio: {perf_gain + 1:.3f}X")
590+
logger.info(f"orig_async_throughput: {original_code_baseline.async_throughput}")
591+
logger.info(f"candidate_result.async_throughput: {candidate_result.async_throughput}")
592+
if (
593+
original_code_baseline.async_throughput is not None
594+
and candidate_result.async_throughput is not None
595+
):
596+
throughput_gain_value = throughput_gain(
597+
original_throughput=original_code_baseline.async_throughput,
598+
optimized_throughput=candidate_result.async_throughput,
599+
)
600+
tree.add(f"Original async throughput: {original_code_baseline.async_throughput} executions")
601+
tree.add(f"Optimized async throughput: {candidate_result.async_throughput} executions")
602+
tree.add(f"Throughput improvement: {throughput_gain_value * 100:.1f}%")
580603
line_profile_test_results = self.line_profiler_step(
581604
code_context=code_context,
582605
original_helper_code=original_helper_code,
@@ -1509,10 +1532,12 @@ def establish_original_code_baseline(
15091532
for result in benchmarking_results.test_results:
15101533
if result.stdout:
15111534
all_stdout += result.stdout
1512-
1535+
logger.info("Calculating async function throughput from test output...")
1536+
logger.info(f"All stdout for async throughput calculation:\n{all_stdout}")
15131537
async_throughput = calculate_function_throughput_from_stdout(
15141538
all_stdout, self.function_to_optimize.function_name
15151539
)
1540+
logger.info(f"Original async function throughput: {async_throughput} calls/second")
15161541

15171542
if self.args.benchmark:
15181543
replay_benchmarking_test_results = benchmarking_results.group_by_benchmarks(
@@ -1680,7 +1705,6 @@ def run_optimized_candidate(
16801705
if result.stdout:
16811706
all_stdout += result.stdout
16821707

1683-
16841708
candidate_async_throughput = calculate_function_throughput_from_stdout(
16851709
all_stdout, self.function_to_optimize.function_name
16861710
)

codeflash/result/critic.py

Lines changed: 51 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
COVERAGE_THRESHOLD,
99
MIN_IMPROVEMENT_THRESHOLD,
1010
MIN_TESTCASE_PASSED_THRESHOLD,
11+
MIN_THROUGHPUT_IMPROVEMENT_THRESHOLD,
1112
)
1213
from codeflash.models.models import TestType
1314

@@ -25,31 +26,73 @@ def performance_gain(*, original_runtime_ns: int, optimized_runtime_ns: int) ->
2526
return (original_runtime_ns - optimized_runtime_ns) / optimized_runtime_ns
2627

2728

29+
def throughput_gain(*, original_throughput: int, optimized_throughput: int) -> float:
    """Return the relative throughput change of the optimized code over the original.

    Multiply the result by 100 to get a percentage. For throughput, higher
    values are better, so a positive result is an improvement and a negative
    result is a regression.

    Args:
        original_throughput: Throughput measured for the original code.
        optimized_throughput: Throughput measured for the optimized code.

    Returns:
        ``(optimized_throughput - original_throughput) / original_throughput``,
        or ``0.0`` when ``original_throughput`` is zero.
    """
    if original_throughput == 0:
        # A zero baseline would divide by zero; report "no gain" instead of raising.
        return 0.0
    return (optimized_throughput - original_throughput) / original_throughput
38+
39+
2840
def speedup_critic(
    candidate_result: OptimizedCandidateResult,
    original_code_runtime: int,
    best_runtime_until_now: int | None,
    *,
    disable_gh_action_noise: bool = False,
    original_async_throughput: int | None = None,
    best_throughput_until_now: int | None = None,
) -> bool:
    """Decide whether a correct optimization candidate should be surfaced to the user.

    Runtime check (always applied):
    - The candidate's performance gain must exceed the noise floor. The noise
      floor is 3 x MIN_IMPROVEMENT_THRESHOLD when ``original_code_runtime`` is
      below 10000 and MIN_IMPROVEMENT_THRESHOLD otherwise.
    - The noise floor is doubled when ``env_utils.is_ci()`` is true, unless
      ``disable_gh_action_noise`` is set.
    - When ``best_runtime_until_now`` is given, the candidate must also be
      strictly faster than it.

    Async throughput check (applied only when both ``original_async_throughput``
    and ``candidate_result.async_throughput`` are not None):
    - When the original throughput is positive, the throughput gain must exceed
      MIN_THROUGHPUT_IMPROVEMENT_THRESHOLD. A zero original throughput skips
      this threshold comparison.
    - When ``best_throughput_until_now`` is given, the candidate's throughput
      must be strictly greater than it.

    Args:
        candidate_result: Result of running the optimization candidate.
        original_code_runtime: Runtime of the original code.
        best_runtime_until_now: Best candidate runtime seen so far, or None to
            compare against the original only.
        disable_gh_action_noise: When True, never double the noise floor for CI.
        original_async_throughput: Throughput of the original code, if measured.
        best_throughput_until_now: Best candidate throughput seen so far, if any.

    Returns:
        True when every applicable check passes, otherwise False.
    """
    # Runtime performance evaluation
    noise_floor = 3 * MIN_IMPROVEMENT_THRESHOLD if original_code_runtime < 10000 else MIN_IMPROVEMENT_THRESHOLD
    if not disable_gh_action_noise and env_utils.is_ci():
        noise_floor = noise_floor * 2  # Increase the noise floor in GitHub Actions mode

    perf_gain = performance_gain(
        original_runtime_ns=original_code_runtime, optimized_runtime_ns=candidate_result.best_test_runtime
    )
    runtime_improved = perf_gain > noise_floor
    runtime_is_best = best_runtime_until_now is None or candidate_result.best_test_runtime < best_runtime_until_now
    runtime_ok = bool(runtime_improved and runtime_is_best)

    # Sync functions, or async functions without throughput data: only runtime matters.
    if original_async_throughput is None or candidate_result.async_throughput is None:
        return runtime_ok

    # With throughput data, both runtime and throughput have to improve.
    throughput_improved = True  # Stays True when the baseline is zero (no ratio can be computed)
    if original_async_throughput > 0:
        throughput_gain_value = throughput_gain(
            original_throughput=original_async_throughput, optimized_throughput=candidate_result.async_throughput
        )
        throughput_improved = throughput_gain_value > MIN_THROUGHPUT_IMPROVEMENT_THRESHOLD
        logger.debug(
            f"Async throughput gain: {throughput_gain_value * 100:.1f}% "
            f"(original: {original_async_throughput}, optimized: {candidate_result.async_throughput})"
        )

    throughput_is_best = (
        best_throughput_until_now is None or candidate_result.async_throughput > best_throughput_until_now
    )
    return bool(runtime_ok and throughput_improved and throughput_is_best)
5396

5497

5598
def quantity_of_tests_critic(candidate_result: OptimizedCandidateResult | OriginalCodeBaseline) -> bool:

codeflash/verification/parse_test_output.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -94,10 +94,13 @@ def calculate_function_throughput_from_stdout(stdout: str, function_name: str) -
9494

9595
# Count completed executions for the specific function only
9696
function_throughput = 0
97-
97+
logger.info(f"Total start matches: {len(start_matches)}, Total end matches: {len(end_matches)}")
9898
for start_match in start_matches:
9999
# Check if this execution is for the function we're interested in and has a matching end tag
100100
# function_name is at index 2 in the match tuple
101+
logger.info(f"Start match: {start_match}")
102+
logger.info(f"End matches: {end_matches_set}")
103+
logger.info(f"Function name: {function_name}")
101104
if start_match in end_matches_set and len(start_match) > 2 and start_match[2] == function_name:
102105
function_throughput += 1
103106

tests/test_critic.py

Lines changed: 163 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,13 @@
1414
TestResults,
1515
TestType,
1616
)
17-
from codeflash.result.critic import coverage_critic, performance_gain, quantity_of_tests_critic, speedup_critic
17+
from codeflash.result.critic import (
18+
coverage_critic,
19+
performance_gain,
20+
quantity_of_tests_critic,
21+
speedup_critic,
22+
throughput_gain,
23+
)
1824

1925

2026
def test_performance_gain() -> None:
@@ -429,3 +435,159 @@ def test_coverage_critic() -> None:
429435
)
430436

431437
assert coverage_critic(unittest_coverage, "unittest") is True
438+
439+
440+
def test_throughput_gain() -> None:
    """Check throughput_gain for improvement, no change, regression and a zero baseline."""
    import math

    # Basic throughput improvement: 100 -> 150 is +50%
    assert throughput_gain(original_throughput=100, optimized_throughput=150) == 0.5

    # No improvement
    assert throughput_gain(original_throughput=100, optimized_throughput=100) == 0.0

    # Regression: 100 -> 80 is -20%. Compared with a tolerance because -0.2 is
    # not exactly representable in binary floating point.
    assert math.isclose(throughput_gain(original_throughput=100, optimized_throughput=80), -0.2)

    # Zero original throughput (edge case): reported as no gain
    assert throughput_gain(original_throughput=0, optimized_throughput=50) == 0.0

    # Large improvement: 50 -> 200 is +300%
    assert throughput_gain(original_throughput=50, optimized_throughput=200) == 3.0
456+
457+
458+
def test_speedup_critic_with_async_throughput() -> None:
    """Check how speedup_critic combines runtime and async throughput results."""
    original_code_runtime = 10000  # 10 microseconds
    original_async_throughput = 100

    def make_candidate(best_test_runtime, async_throughput) -> OptimizedCandidateResult:
        # Only the runtime and the throughput vary between the cases below.
        return OptimizedCandidateResult(
            max_loop_count=5,
            best_test_runtime=best_test_runtime,
            behavior_test_results=TestResults(),
            benchmarking_test_results=TestResults(),
            optimization_candidate_index=0,
            total_candidate_timing=best_test_runtime,
            async_throughput=async_throughput,
        )

    def accepted(candidate, *, original_throughput=original_async_throughput, best_throughput=None) -> bool:
        # CI noise doubling is disabled so the outcome does not depend on where the tests run.
        return speedup_critic(
            candidate_result=candidate,
            original_code_runtime=original_code_runtime,
            best_runtime_until_now=None,
            original_async_throughput=original_throughput,
            best_throughput_until_now=best_throughput,
            disable_gh_action_noise=True,
        )

    # Test case 1: Both runtime and throughput improve significantly
    # (runtime 10000 -> 8000, throughput 100 -> 120, i.e. +20%)
    assert accepted(make_candidate(8000, 120))

    # Test case 2: Runtime improves but throughput doesn't meet threshold
    # (throughput 100 -> 105 is only +5%, below the 10% threshold)
    assert not accepted(make_candidate(8000, 105))

    # Test case 3: Throughput improves but runtime doesn't meet threshold
    # (runtime 10000 -> 9800 is only about a 2% gain)
    assert not accepted(make_candidate(9800, 120))

    # Test case 4: No throughput data - should fall back to runtime-only evaluation
    assert accepted(make_candidate(8000, None), original_throughput=None)

    # Test case 5: Test best_throughput_until_now comparison
    candidate_result = make_candidate(8000, 115)  # +15% throughput

    # Should pass when no best throughput yet
    assert accepted(candidate_result)

    # Should fail when there's a better throughput already
    assert not accepted(candidate_result, best_throughput=120)

    # Test case 6: Zero original throughput (edge case)
    # Should pass when original throughput is 0 (throughput threshold check skipped)
    assert accepted(make_candidate(8000, 50), original_throughput=0)

0 commit comments

Comments
 (0)