Skip to content

Commit ca7af3e

Browse files
refactoring
1 parent 4a059e1 commit ca7af3e

File tree

13 files changed

+7659
-285
lines changed

13 files changed

+7659
-285
lines changed

code2logic/benchmarks/common.py

Lines changed: 106 additions & 33 deletions
Original file line number | Diff line number | Diff line change
@@ -189,22 +189,66 @@ def get_async_reproduction_prompt(spec: str, fmt: str, file_name: str, with_test
189189

190190
def get_token_reproduction_prompt(spec: str, fmt: str, file_name: str, language: str = "python") -> str:
191191
format_hints = {
192-
"json": "Parse the JSON structure and implement all classes and functions with exact signatures.",
192+
"json": """Parse the JSON structure carefully:
193+
- 'modules' array contains file-level info with 'classes' and 'functions'
194+
- Each class has 'name', 'bases', 'methods' with full signatures
195+
- Each function has 'name', 'params', 'returns', 'doc'
196+
- Implement ALL classes with their methods and ALL standalone functions
197+
- Use the 'doc' field to implement actual logic, not just stubs
198+
CRITICAL: Match every class/function name and signature exactly.""",
193199
"json_compact": "Parse the compact JSON and implement all elements with exact signatures.",
194-
"yaml": "Parse the YAML structure and implement all classes and functions with exact signatures.",
195-
"gherkin": """Parse Gherkin/BDD scenarios and implement them as working code:
196-
- Each Feature maps to a class or module
197-
- Each Scenario maps to a function
198-
- Given/When/Then steps describe the logic flow
199-
- Implement actual logic, not just stubs
200-
Focus on the described behavior and implement it directly.""",
201-
"markdown": "Parse embedded Gherkin (behaviors) and YAML (structures). Implement all described classes and functions.",
202-
"logicml": """Parse LogicML and generate VALID code:
203-
- 'sig:' lines describe function signatures (translate to the target language)
204-
- 'type: re-export' means this module primarily re-exports symbols
205-
- 'attrs:' = instance attributes to set in constructor
200+
"yaml": """Parse the YAML structure carefully:
201+
- Top-level keys describe modules with classes and functions
202+
- Each class has 'bases', 'properties', 'methods' with signatures
203+
- Each function has params, return type, and docstring/intent
204+
- Implement ALL classes, methods, and standalone functions
205+
- Use intent/docstring to write actual logic, not placeholders
206+
CRITICAL: Match every name and signature exactly as specified.""",
207+
"gherkin": """Parse Gherkin/BDD specification and reconstruct the ORIGINAL source code:
208+
- 'Feature:' = a class or module (use the name after Feature)
209+
- 'Scenario:' = a function or method to implement
210+
- 'Given' steps = setup / preconditions / imports needed
211+
- 'When' steps = the core action / logic to implement
212+
- 'Then' steps = expected outcomes / return values / assertions
213+
- 'And' continues the previous step type
214+
- '@tag' annotations may indicate decorators or categories
215+
216+
IMPORTANT RULES:
217+
1. Each Scenario becomes a real function with actual logic (NOT test code)
218+
2. Given/When/Then describe behavior, translate them to implementation
219+
3. Include all imports mentioned in Given steps
220+
4. Use type hints based on parameter descriptions
221+
5. Implement real logic based on When/Then steps, not just stubs
222+
6. If a Feature has multiple Scenarios, they are methods of the same class""",
223+
"markdown": """Parse the Markdown specification to reconstruct source code:
224+
- '## Module' or '### Class' headings define code structure
225+
- Embedded YAML blocks describe attributes, methods, signatures
226+
- Embedded Gherkin blocks describe behaviors to implement
227+
- Code blocks show example usage or signatures
228+
- Tables may list functions with their parameters and return types
229+
230+
IMPORTANT RULES:
231+
1. Extract class names, method signatures, and function signatures from headings and YAML
232+
2. Implement all listed methods with actual logic based on descriptions
233+
3. Include all imports mentioned anywhere in the document
234+
4. Use type hints from signatures or parameter descriptions
235+
5. Docstrings should come from the description text""",
236+
"logicml": """Parse LogicML and generate VALID, complete code:
237+
- 'module:' = file to generate
238+
- 'sig:' lines = EXACT function signatures (translate to target language)
239+
- 'does:' = function intent/docstring — use this to implement real logic
240+
- 'type: re-export' = module primarily re-exports symbols from imports
241+
- 'attrs:' = instance attributes to initialize in __init__/constructor
206242
- 'bases:' = parent classes to inherit from
207-
CRITICAL: Ensure valid syntax - balanced brackets, proper indentation, no undefined variables.""",
243+
- 'decorators:' = decorators to apply
244+
- 'calls:' = other functions this function calls (implement the call chain)
245+
- 'raises:' = exceptions this function may raise
246+
247+
CRITICAL RULES:
248+
1. Translate EVERY 'sig:' line into a real function with actual logic
249+
2. Use 'does:' text to implement meaningful function bodies
250+
3. Ensure valid syntax - balanced brackets, proper indentation
251+
4. Include ALL imports listed in the module""",
208252
"toon": """Parse TOON (Token-Oriented Object Notation) format carefully:
209253
210254
STRUCTURE:
@@ -222,19 +266,43 @@ def get_token_reproduction_prompt(spec: str, fmt: str, file_name: str, language:
222266
- 'decorators: @property' = add @property decorator
223267
- 'decorators: @staticmethod|@cache' = multiple decorators
224268
225-
CRITICAL: Use imports[], function_docs, and exact signatures to reproduce code accurately.""",
226-
"csv": """Parse the CSV table where each row describes a code element:
227-
- Columns: path, type (class/method/function), name, signature, language, intent, category, domain, imports
228-
- 'method' rows belong to the class in the preceding 'class' row
229-
- Implement all elements with the exact signatures shown
230-
Generate complete code with all classes, methods, and functions.""",
231-
"function.toon": """Parse the function-logic TOON format:
232-
- 'modules[N]{path,lang,items}:' lists files
233-
- 'function_details:' contains per-module function listings
234-
- Each function has: line number, name, signature, description
235-
- 'ClassName.method_name' = method of that class
236-
- 'cc:N' after name = cyclomatic complexity
237-
Implement all listed functions with matching signatures and described behavior.""",
269+
CRITICAL RULES:
270+
1. Use imports[] to generate all import statements
271+
2. Use function_docs to write real function bodies (not stubs)
272+
3. Match exact signatures from sig: fields
273+
4. Include ALL classes with their methods and ALL standalone functions
274+
5. Preserve async functions (marked with 'async: true')""",
275+
"csv": """Parse the CSV table to reconstruct source code:
276+
- Columns: path, type, name, signature, language, intent, category, domain, imports
277+
- 'type=class' rows define classes (look at 'bases' if present)
278+
- 'type=method' rows are methods of the preceding class
279+
- 'type=function' rows are standalone functions
280+
- 'signature' column has the exact function signature to use
281+
- 'intent' column describes what the function does — use it to implement real logic
282+
- 'imports' column lists required imports
283+
284+
IMPORTANT RULES:
285+
1. Group methods under their parent class
286+
2. Include all imports from the 'imports' column
287+
3. Match signatures exactly as shown
288+
4. Use 'intent' to implement actual logic, not just stubs
289+
5. Add type hints based on signature information""",
290+
"function.toon": """Parse the function-logic TOON format to reconstruct source code:
291+
- 'modules[N]{path,lang,items}:' lists source files and their function count
292+
- 'function_details:' contains per-module function listings as tables
293+
- Table columns: line, name, sig[, does, decorators, calls, raises]
294+
- 'ClassName.method_name' = this is a method of ClassName (create the class)
295+
- '~function_name' = async function (add async keyword)
296+
- 'cc:N' suffix on name = cyclomatic complexity hint (more complex logic needed)
297+
- 'sig' column has exact signature: (params)->ReturnType
298+
299+
CRITICAL RULES:
300+
1. Create classes for any ClassName that appears as prefix in 'ClassName.method'
301+
2. Translate EVERY listed function into real code with actual logic
302+
3. Use 'does' column text to implement meaningful function bodies
303+
4. Match signatures EXACTLY from the 'sig' column
304+
5. Include imports needed for the types and calls referenced
305+
6. Preserve method grouping under their classes""",
238306
}
239307

240308
# Language-specific guidance appended to prompt
@@ -248,7 +316,7 @@ def get_token_reproduction_prompt(spec: str, fmt: str, file_name: str, language:
248316
"sql": "Use standard SQL: CREATE TABLE/VIEW/FUNCTION, proper column types, constraints.",
249317
}
250318

251-
max_spec = 8000
319+
max_spec = 12000
252320
spec_truncated = spec[:max_spec] if len(spec) > max_spec else spec
253321

254322
language_norm = (language or "python").strip().lower()
@@ -267,15 +335,20 @@ def get_token_reproduction_prompt(spec: str, fmt: str, file_name: str, language:
267335
lang_hint = lang_hints.get(language_norm, '')
268336
lang_hint_line = f"\n{lang_hint}" if lang_hint else ''
269337

270-
prompt = f"""Generate {lang_label} code from this {fmt.upper()} specification.
338+
prompt = f"""Generate complete {lang_label} source code from this {fmt.upper()} specification.
271339
{format_hints.get(fmt, '')}{lang_hint_line}
272340
341+
SPECIFICATION:
273342
{spec_truncated}
274343
275-
Requirements:
276-
- Complete, working {lang_label} code for {file_name}
277-
- Include imports and type hints
278-
- Implement all functions with actual logic
344+
REQUIREMENTS:
345+
- Output complete, working {lang_label} code for {file_name}
346+
- Include ALL imports at the top
347+
- Implement ALL classes, methods, and functions listed in the specification
348+
- Use type hints throughout
349+
- Write real logic based on descriptions/intents, NOT placeholder stubs
350+
- Match function signatures EXACTLY as specified
351+
- Output ONLY the code, no explanations
279352
280353
```{language_norm}
281354
"""

code2logic/benchmarks/results.py

Lines changed: 23 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -118,6 +118,7 @@ class BenchmarkResult:
118118
avg_similarity: float = 0.0
119119
syntax_ok_rate: float = 0.0
120120
runs_ok_rate: float = 0.0
121+
failure_rate: float = 0.0
121122

122123
# Best format (for format comparisons)
123124
best_format: str = ""
@@ -144,17 +145,18 @@ def __post_init__(self):
144145

145146
def calculate_aggregates(self):
146147
"""Calculate aggregate metrics from detailed results."""
147-
# File results
148+
# File results – include ALL scores (zeros count as failures)
148149
if self.file_results:
149-
scores = [r.score for r in self.file_results if r.score > 0]
150-
self.avg_score = sum(scores) / len(scores) if scores else 0
150+
all_scores = [r.score for r in self.file_results]
151+
self.avg_score = sum(all_scores) / len(all_scores) if all_scores else 0
152+
self.failure_rate = sum(1 for s in all_scores if s == 0) / len(all_scores) * 100
151153
self.syntax_ok_rate = sum(1 for r in self.file_results if r.syntax_ok) / len(self.file_results) * 100
152154
self.runs_ok_rate = sum(1 for r in self.file_results if r.runs_ok) / len(self.file_results) * 100
153155

154-
# Function results
156+
# Function results – include ALL similarities
155157
if self.function_results:
156-
sims = [r.similarity for r in self.function_results if r.similarity > 0]
157-
self.avg_similarity = sum(sims) / len(sims) if sims else 0
158+
all_sims = [r.similarity for r in self.function_results]
159+
self.avg_similarity = sum(all_sims) / len(all_sims) if all_sims else 0
158160

159161
# Format results
160162
if self.format_results:
@@ -187,10 +189,24 @@ def load(cls, path: str) -> 'BenchmarkResult':
187189
"""Load result from JSON file."""
188190
data = json.loads(Path(path).read_text())
189191
# Reconstruct nested objects
190-
file_results = [FileResult(**r) for r in data.pop('file_results', [])]
192+
raw_file_results = data.pop('file_results', [])
193+
file_results = []
194+
for r in raw_file_results:
195+
fmt_results_raw = r.pop('format_results', {})
196+
fr = FileResult(**r)
197+
fr.format_results = {
198+
k: FormatResult(**v) if isinstance(v, dict) else v
199+
for k, v in fmt_results_raw.items()
200+
}
201+
file_results.append(fr)
191202
function_results = [FunctionResult(**r) for r in data.pop('function_results', [])]
192203
format_results = [FormatResult(**r) for r in data.pop('format_results', [])]
193204

205+
# Remove unknown fields that may not be in the dataclass
206+
import dataclasses
207+
known_fields = {f.name for f in dataclasses.fields(cls)}
208+
data = {k: v for k, v in data.items() if k in known_fields}
209+
194210
result = cls(**data)
195211
result.file_results = file_results
196212
result.function_results = function_results

code2logic/benchmarks/runner.py

Lines changed: 47 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -147,20 +147,32 @@ def _structural_score(original: str, generated: str, language: str) -> float:
147147
if not o:
148148
return 0.0
149149
keys = list(o.keys())
150-
matches = 0
150+
total = 0.0
151151
for k in keys:
152-
if o.get(k, 0) == g.get(k, 0):
153-
matches += 1
154-
return matches / max(len(keys), 1) * 100
152+
ov = o.get(k, 0)
153+
gv = g.get(k, 0)
154+
if ov == 0 and gv == 0:
155+
total += 1.0
156+
elif max(ov, gv) > 0:
157+
total += min(ov, gv) / max(ov, gv)
158+
return total / max(len(keys), 1) * 100
155159

156160

157161
def _extract_code(response: str) -> str:
158162
"""Extract code from LLM response."""
159163
if not response:
160164
return ""
161165

162-
# Try to find code block
163-
for marker in ['```python', '```py', '```']:
166+
# Try to find code block — check language-specific markers first, then generic
167+
markers = [
168+
'```python', '```py',
169+
'```javascript', '```js', '```typescript', '```ts',
170+
'```go', '```rust', '```rs',
171+
'```java', '```csharp', '```cs', '```c#',
172+
'```sql',
173+
'```',
174+
]
175+
for marker in markers:
164176
if marker in response:
165177
start = response.find(marker) + len(marker)
166178
if start < len(response) and response[start] == '\n':
@@ -469,12 +481,12 @@ def run_format_benchmark(
469481

470482
result.total_time = time.time() - start_time
471483

472-
# Calculate format aggregates
484+
# Calculate format aggregates – include ALL scores (zeros = failures)
473485
for fmt in formats:
474486
scores = [
475487
fr.format_results[fmt].score
476488
for fr in result.file_results
477-
if fmt in fr.format_results and fr.format_results[fmt].score > 0
489+
if fmt in fr.format_results
478490
]
479491
if scores:
480492
result.format_scores[fmt] = sum(scores) / len(scores)
@@ -762,24 +774,34 @@ def _test_function(
762774

763775
result.original_code = '\n'.join(lines[start:end])
764776

765-
# Create spec
777+
# Create spec with richer context
778+
calls_str = ', '.join(getattr(func, 'calls', []) or []) or 'None'
779+
raises_str = ', '.join(getattr(func, 'raises', []) or []) or 'None'
780+
cc = getattr(func, 'complexity', 1) or 1
766781
spec = f"""Function: {func.name}
767782
Language: {language}
768783
Signature: {func.name}({', '.join(func.params)}) -> {func.return_type or 'None'}
769784
Description: {func.intent or func.docstring or 'No description'}
770785
Is Async: {func.is_async}
771786
Decorators: {', '.join(func.decorators) if func.decorators else 'None'}
787+
Calls: {calls_str}
788+
Raises: {raises_str}
789+
Complexity: {cc}
772790
Lines: {func.lines}
773791
"""
774792

775-
prompt = f"""Generate ONLY the function code based on this specification:
793+
prompt = f"""Generate ONLY the complete function code based on this specification:
776794
777795
{spec}
778796
779-
Requirements:
780-
- Generate complete, working {language} function
781-
- Match the signature exactly
782-
- Output ONLY the function code
797+
REQUIREMENTS:
798+
- Generate a complete, working {language} function with REAL logic (not a stub)
799+
- Match the signature EXACTLY: {func.name}({', '.join(func.params)}) -> {func.return_type or 'None'}
800+
- Use the Description to implement actual behavior
801+
- Include decorators if specified
802+
- The function should be ~{func.lines} lines long
803+
- Include proper error handling if Raises is specified
804+
- Output ONLY the function code, no explanations
783805
784806
```{language}
785807
"""
@@ -793,7 +815,7 @@ def _test_function(
793815
result.gen_time = 0.0
794816
else:
795817
start_time = time.time()
796-
response = client.generate(prompt, max_tokens=2000)
818+
response = client.generate(prompt, max_tokens=3000)
797819
result.gen_time = time.time() - start_time
798820
result.reproduced_code = _extract_code(response)
799821

@@ -942,17 +964,23 @@ def run_project_benchmark(
942964

943965
result.total_time = time.time() - start_time
944966

945-
# Calculate format aggregates
967+
# Calculate format aggregates – include ALL scores (zeros = failures)
946968
for fmt in formats:
947969
scores = []
948970
for fr in result.file_results:
949971
if fmt in fr.format_results:
950-
score = fr.format_results[fmt].score
951-
if score > 0:
952-
scores.append(score)
972+
scores.append(fr.format_results[fmt].score)
953973
if scores:
954974
result.format_scores[fmt] = sum(scores) / len(scores)
955975

976+
# Recalculate each file's score as average across all its formats
977+
for fr in result.file_results:
978+
if fr.format_results:
979+
fmt_scores = [r.score for r in fr.format_results.values()]
980+
fr.score = sum(fmt_scores) / len(fmt_scores)
981+
fr.syntax_ok = all(r.syntax_ok for r in fr.format_results.values())
982+
fr.runs_ok = any(r.runs_ok for r in fr.format_results.values())
983+
956984
result.calculate_aggregates()
957985

958986
return result

code2logic/cli.py

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -673,6 +673,13 @@ def _maybe_print_pretty_help() -> bool:
673673
action='store_true',
674674
help='Include the does/intent column in function-logic TOON output. Without this flag, the does column is omitted to save tokens.'
675675
)
676+
parser.add_argument(
677+
'--function-logic-context',
678+
choices=['none', 'minimal', 'full'],
679+
default='none',
680+
dest='function_logic_context',
681+
help='Structural context in function-logic TOON: none (flat list), minimal (class headers with bases), full (classes + properties + imports). Default: none.'
682+
)
676683
parser.add_argument(
677684
'--no-install',
678685
action='store_true',

0 commit comments

Comments
 (0)