Skip to content

Commit b722e41

Browse files
Serhan-Asadclaude
andcommitted
fix: apply cost extraction fix to operation_log.py and add passing tests (#555)
- operation_log.py: use _extract_cost_from_result/_extract_model_from_result helpers instead of hardcoded result[1]/result[2] (same bug as sync_orchestration.py) - Update test files to import and test the actual fixed helpers (29/29 pass) Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent d9ab380 commit b722e41

File tree

3 files changed

+841
-2
lines changed

3 files changed

+841
-2
lines changed

pdd/operation_log.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -336,8 +336,9 @@ def wrapper(*args: Any, **kwargs: Any) -> Any:
336336
model = "unknown"
337337
if success and result:
338338
if isinstance(result, tuple) and len(result) >= 3:
339-
if isinstance(result[1], (int, float)): cost = float(result[1])
340-
if isinstance(result[2], str): model = str(result[2])
339+
from .sync_orchestration import _extract_cost_from_result, _extract_model_from_result
340+
cost = _extract_cost_from_result(operation, result)
341+
model = _extract_model_from_result(operation, result)
341342

342343
update_log_entry(entry, success=success, cost=cost, model=model, duration=duration, error=error_msg)
343344
if basename and language:
Lines changed: 335 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,335 @@
1+
"""Tests for issue #555: cost extraction reads wrong tuple index for generate/crash/fix/verify.
2+
3+
The bug: sync_orchestration.py lines 1857-1858 and 1889-1890 always read result[1] as cost
4+
and result[2] as model, but each operation returns a different tuple format:
5+
6+
- generate: (content, was_incremental, cost, model) — cost at index 2, model at index 3
7+
- crash: (success, final_code, final_program, attempts, cost, model) — cost at index 4, model at index 5
8+
- fix: (success, fixed_unit_test, fixed_code, attempts, total_cost, model_name) — cost at index 4, model at index 5
9+
- verify: (success, final_program, final_code, attempts, total_cost, model_name) — cost at index 4, model at index 5
10+
11+
Secondary bug: operation_log.py lines 339-340 has the same incorrect extraction logic.
12+
13+
Bool quirk: Python's bool subclasses int, so isinstance(False, (int, float)) returns True.
14+
For generate, result[1] is was_incremental (bool), which silently passes the isinstance check
15+
and cost becomes int(False)=0 or int(True)=1 instead of the actual cost at result[2].
16+
"""
17+
18+
import pytest
19+
20+
from pdd.sync_orchestration import _extract_cost_from_result, _extract_model_from_result
21+
22+
23+
class TestCostExtractionBudgetAccumulation:
24+
"""Test cost extraction via _extract_cost_from_result helper (fixes sync_orchestration.py:1857-1858).
25+
26+
The old buggy code always read result[1] as cost with no operation-specific branching.
27+
The fix dispatches to the correct index per operation.
28+
"""
29+
30+
def test_generate_cost_extraction(self):
31+
"""generate returns (content, was_incremental, cost, model) — cost is at index 2, not 1.
32+
33+
Bug: result[1] is was_incremental (bool). Since bool subclasses int,
34+
isinstance(False, (int, float)) is True → cost becomes 0 instead of 0.0421.
35+
"""
36+
# code_generator_main return format: (content, was_incremental, cost, model)
37+
result = ("generated code", False, 0.0421, "claude-sonnet-4-6")
38+
39+
cost = _extract_cost_from_result('generate', result)
40+
41+
assert cost == pytest.approx(0.0421), (
42+
f"generate cost should be {result[2]} (index 2) but got {cost}. "
43+
f"result[1] = {result[1]!r} (was_incremental) was read as cost due to bool⊂int."
44+
)
45+
46+
def test_crash_cost_extraction(self):
47+
"""crash returns (success, final_code, final_program, attempts, cost, model) — cost at index 4.
48+
49+
Bug: result[1] is final_code (str), isinstance fails → cost defaults to 0.0.
50+
"""
51+
# crash_main return format: (success, final_code, final_program, attempts, cost, model)
52+
result = (True, "code content", "program content", 2, 0.0315, "gpt-4o-mini")
53+
54+
cost = _extract_cost_from_result('crash', result)
55+
56+
assert cost == pytest.approx(0.0315), (
57+
f"crash cost should be {result[4]} (index 4) but got {cost}. "
58+
f"result[1] = {result[1]!r} (final_code string) is not a number."
59+
)
60+
61+
def test_fix_cost_extraction(self):
62+
"""fix returns (success, fixed_unit_test, fixed_code, attempts, total_cost, model) — cost at index 4.
63+
64+
Bug: result[1] is fixed_unit_test (str), isinstance fails → cost defaults to 0.0.
65+
"""
66+
# fix_main return format: (success, fixed_unit_test, fixed_code, attempts, total_cost, model_name)
67+
result = (True, "fixed test code", "fixed source code", 3, 0.0892, "claude-sonnet-4-6")
68+
69+
cost = _extract_cost_from_result('fix', result)
70+
71+
assert cost == pytest.approx(0.0892), (
72+
f"fix cost should be {result[4]} (index 4) but got {cost}. "
73+
f"result[1] = {result[1]!r} (fixed_unit_test string) is not a number."
74+
)
75+
76+
def test_verify_cost_extraction(self):
77+
"""verify returns (success, final_program, final_code, attempts, total_cost, model) — cost at index 4.
78+
79+
Bug: result[1] is final_program (str), isinstance fails → cost defaults to 0.0.
80+
"""
81+
# fix_verification_main return format: (success, final_program, final_code, attempts, total_cost, model_name)
82+
result = (True, "program content", "code content", 1, 0.0267, "gpt-4o-mini")
83+
84+
cost = _extract_cost_from_result('verify', result)
85+
86+
assert cost == pytest.approx(0.0267), (
87+
f"verify cost should be {result[4]} (index 4) but got {cost}. "
88+
f"result[1] = {result[1]!r} (final_program string) is not a number."
89+
)
90+
91+
92+
class TestCostExtractionLogging:
93+
"""Test cost/model extraction via helpers (fixes sync_orchestration.py:1885-1890 log entry).
94+
95+
The old buggy code read result[1] as cost and result[2] as model for all non-test operations,
96+
which is wrong for generate (cost=[2], model=[3]) and crash/fix/verify (cost=[4], model=[5]).
97+
"""
98+
99+
def test_generate_logging_cost_and_model(self):
100+
"""generate: cost should be result[2], model should be result[3].
101+
102+
Bug: Reads result[1] (was_incremental=False) as cost → 0 (bool→int).
103+
Reads result[2] (0.0421 float) as model → 'unknown' (not a string).
104+
"""
105+
result = ("generated code", False, 0.0421, "claude-sonnet-4-6")
106+
107+
actual_cost = _extract_cost_from_result('generate', result)
108+
model_name = _extract_model_from_result('generate', result)
109+
110+
assert actual_cost == pytest.approx(0.0421), (
111+
f"generate log cost should be {result[2]} but got {actual_cost}. "
112+
f"result[1] = {result[1]!r} (was_incremental bool) was misread as cost."
113+
)
114+
assert model_name == "claude-sonnet-4-6", (
115+
f"generate log model should be '{result[3]}' but got '{model_name}'. "
116+
f"result[2] = {result[2]!r} (cost float) was misread as model."
117+
)
118+
119+
def test_crash_logging_cost_and_model(self):
120+
"""crash: cost should be result[4], model should be result[5].
121+
122+
Bug: Reads result[1] (final_code str) as cost → 0.0 (isinstance fails).
123+
Reads result[2] (final_program str) as model → uses wrong string.
124+
"""
125+
result = (True, "code content", "program content", 2, 0.0315, "gpt-4o-mini")
126+
127+
actual_cost = _extract_cost_from_result('crash', result)
128+
model_name = _extract_model_from_result('crash', result)
129+
130+
assert actual_cost == pytest.approx(0.0315), (
131+
f"crash log cost should be {result[4]} but got {actual_cost}."
132+
)
133+
assert model_name == "gpt-4o-mini", (
134+
f"crash log model should be '{result[5]}' but got '{model_name}'. "
135+
f"result[2] = {result[2]!r} (final_program string) was misread as model."
136+
)
137+
138+
def test_fix_logging_cost_and_model(self):
139+
"""fix: cost should be result[4], model should be result[5].
140+
141+
Bug: Reads result[1] (fixed_unit_test str) as cost → 0.0 (isinstance fails).
142+
Reads result[2] (fixed_code str) as model → uses wrong string.
143+
"""
144+
result = (True, "fixed test code", "fixed source code", 3, 0.0892, "claude-sonnet-4-6")
145+
146+
actual_cost = _extract_cost_from_result('fix', result)
147+
model_name = _extract_model_from_result('fix', result)
148+
149+
assert actual_cost == pytest.approx(0.0892), (
150+
f"fix log cost should be {result[4]} but got {actual_cost}."
151+
)
152+
assert model_name == "claude-sonnet-4-6", (
153+
f"fix log model should be '{result[5]}' but got '{model_name}'. "
154+
f"result[2] = {result[2]!r} (fixed_code string) was misread as model."
155+
)
156+
157+
def test_verify_logging_cost_and_model(self):
158+
"""verify: cost should be result[4], model should be result[5].
159+
160+
Bug: Reads result[1] (final_program str) as cost → 0.0 (isinstance fails).
161+
Reads result[2] (final_code str) as model → uses wrong string.
162+
"""
163+
result = (True, "program content", "code content", 1, 0.0267, "gpt-4o-mini")
164+
165+
actual_cost = _extract_cost_from_result('verify', result)
166+
model_name = _extract_model_from_result('verify', result)
167+
168+
assert actual_cost == pytest.approx(0.0267), (
169+
f"verify log cost should be {result[4]} but got {actual_cost}."
170+
)
171+
assert model_name == "gpt-4o-mini", (
172+
f"verify log model should be '{result[5]}' but got '{model_name}'. "
173+
f"result[2] = {result[2]!r} (final_code string) was misread as model."
174+
)
175+
176+
177+
class TestBoolSubclassesIntQuirk:
178+
"""Test that bool values are not silently accepted as cost values.
179+
180+
Python's bool subclasses int: isinstance(False, int) is True.
181+
For generate, result[1] = was_incremental (bool). The fixed helper
182+
explicitly excludes bool with `not isinstance(val, bool)`.
183+
"""
184+
185+
def test_bool_false_not_accepted_as_cost(self):
186+
"""The fix must reject bool values — generate's was_incremental should not sneak through.
187+
188+
When was_incremental=False, the old code set cost to 0 (int value of False).
189+
The fix reads index 2 instead.
190+
"""
191+
# generate 4-tuple: (content, was_incremental, cost, model)
192+
result = ("generated code", False, 0.0421, "claude-sonnet-4-6")
193+
194+
cost = _extract_cost_from_result('generate', result)
195+
196+
assert not isinstance(cost, bool), (
197+
f"Cost = {cost!r} is a bool (was_incremental), not a real cost value."
198+
)
199+
assert cost == pytest.approx(0.0421), (
200+
f"Cost should be {result[2]} but got {cost}."
201+
)
202+
203+
def test_bool_true_not_misread_as_cost_one_dollar(self):
204+
"""When was_incremental=True, the fix must NOT record cost as $1.00.
205+
206+
True == 1 in Python, so float(True) = 1.0. The old code would record
207+
an incremental generate that costs $0.0053 as costing $1.00.
208+
"""
209+
# generate 4-tuple with was_incremental=True
210+
result = ("generated code", True, 0.0053, "gpt-4o-mini")
211+
212+
cost = _extract_cost_from_result('generate', result)
213+
214+
assert cost == pytest.approx(0.0053), (
215+
f"Cost should be {result[2]} but got {cost} (True → int 1). "
216+
f"was_incremental=True was interpreted as cost=$1.00."
217+
)
218+
219+
220+
class TestRegressionGuards:
221+
"""Ensure operations that already extract cost correctly continue to work.
222+
223+
These operations have cost at index 1, model at index 2:
224+
- example: (content, cost, model) — 3-tuple
225+
- test: (content, cost, model, agentic_success) — 4-tuple
226+
- test_extend: (content, cost, model, agentic_success) — 4-tuple
227+
"""
228+
229+
def test_example_cost_extraction(self):
230+
"""example returns (content, cost, model) — cost at index 1 works correctly."""
231+
result = ("example content", 0.0023, "gpt-4o-mini")
232+
233+
cost = _extract_cost_from_result('example', result)
234+
model = _extract_model_from_result('example', result)
235+
236+
assert cost == pytest.approx(0.0023)
237+
assert model == "gpt-4o-mini"
238+
239+
def test_test_cost_extraction(self):
240+
"""test returns (content, cost, model, agentic_success) — cost at index 1 works correctly."""
241+
result = ("test content", 0.0078, "claude-sonnet-4-6", True)
242+
243+
cost = _extract_cost_from_result('test', result)
244+
model = _extract_model_from_result('test', result)
245+
246+
assert cost == pytest.approx(0.0078)
247+
assert model == "claude-sonnet-4-6"
248+
249+
def test_test_extend_cost_extraction(self):
250+
"""test_extend returns same format as test — cost at index 1 works correctly."""
251+
result = ("test content", 0.0045, "gpt-4o-mini", False)
252+
253+
cost = _extract_cost_from_result('test_extend', result)
254+
model = _extract_model_from_result('test_extend', result)
255+
256+
assert cost == pytest.approx(0.0045)
257+
assert model == "gpt-4o-mini"
258+
259+
260+
class TestBudgetAccumulationAllOperations:
261+
"""E2E test: budget accumulation across all operation types in a sync loop.
262+
263+
With the bug, generate/crash/fix/verify costs are lost or wrong,
264+
so the total accumulated cost is much lower than actual spend.
265+
"""
266+
267+
def test_total_budget_with_all_operations(self):
268+
"""Simulate sync loop with all 6 operations — total cost should sum correctly.
269+
270+
With the old bug:
271+
- example: 0.01 (correct)
272+
- generate: 0 (was_incremental=False → bool treated as 0)
273+
- crash: 0.0 (string at [1] fails isinstance)
274+
- test: 0.02 (correct)
275+
- fix: 0.0 (string at [1] fails isinstance)
276+
- verify: 0.0 (string at [1] fails isinstance)
277+
Buggy total: ~0.03 instead of 0.23
278+
"""
279+
operations_and_results = [
280+
('example', ("content", 0.01, "gpt-4o-mini")),
281+
('generate', ("code", False, 0.05, "claude-sonnet-4-6")),
282+
('crash', (True, "code", "program", 2, 0.03, "gpt-4o-mini")),
283+
('test', ("tests", 0.02, "gpt-4o-mini", True)),
284+
('fix', (True, "fixed test", "fixed code", 1, 0.08, "claude-sonnet-4-6")),
285+
('verify', (True, "program", "code", 1, 0.04, "gpt-4o-mini")),
286+
]
287+
288+
expected_total = 0.01 + 0.05 + 0.03 + 0.02 + 0.08 + 0.04 # = 0.23
289+
290+
current_cost = 0.0
291+
for operation, result in operations_and_results:
292+
cost = _extract_cost_from_result(operation, result)
293+
current_cost += cost
294+
295+
assert current_cost == pytest.approx(expected_total), (
296+
f"Total cost should be ${expected_total:.2f} but got ${current_cost:.4f}. "
297+
f"generate/crash/fix/verify costs are lost due to wrong tuple index."
298+
)
299+
300+
301+
class TestOperationLogDecoratorBug:
302+
"""Test the identical bug fix in operation_log.py:339-340.
303+
304+
The log_operation decorator now uses the same _extract_cost_from_result and
305+
_extract_model_from_result helpers, fixing the operation-specific dispatch.
306+
"""
307+
308+
def test_generate_decorator_extraction(self):
309+
"""Decorator reads correct index for generate: cost at [2], model at [3]."""
310+
result = ("generated code", False, 0.0421, "claude-sonnet-4-6")
311+
312+
cost = _extract_cost_from_result('generate', result)
313+
model = _extract_model_from_result('generate', result)
314+
315+
assert cost == pytest.approx(0.0421), (
316+
f"Decorator cost for generate should be {result[2]} but got {cost}. "
317+
f"result[1] = {result[1]!r} (bool) was cast to float."
318+
)
319+
assert model == "claude-sonnet-4-6", (
320+
f"Decorator model for generate should be '{result[3]}' but got '{model}'."
321+
)
322+
323+
def test_fix_decorator_extraction(self):
324+
"""Decorator reads correct index for fix: cost at [4], model at [5]."""
325+
result = (True, "fixed test code", "fixed source code", 3, 0.0892, "claude-sonnet-4-6")
326+
327+
cost = _extract_cost_from_result('fix', result)
328+
model = _extract_model_from_result('fix', result)
329+
330+
assert cost == pytest.approx(0.0892), (
331+
f"Decorator cost for fix should be {result[4]} but got {cost}."
332+
)
333+
assert model == "claude-sonnet-4-6", (
334+
f"Decorator model for fix should be '{result[5]}' but got '{model}'."
335+
)

0 commit comments

Comments
 (0)