semcod
diff --git a/code2logic/benchmarks/common.py b/code2logic/benchmarks/common.py
Lines changed: 106 additions & 33 deletions
diff --git a/code2logic/benchmarks/results.py b/code2logic/benchmarks/results.py
Lines changed: 23 additions & 7 deletions
diff --git a/code2logic/benchmarks/runner.py b/code2logic/benchmarks/runner.py
Lines changed: 47 additions & 19 deletions
diff --git a/code2logic/cli.py b/code2logic/cli.py
Lines changed: 7 additions & 0 deletions
@@ -189,22 +189,66 @@ def get_async_reproduction_prompt(spec: str, fmt: str, file_name: str, with_test
 
 def get_token_reproduction_prompt(spec: str, fmt: str, file_name: str, language: str = "python") -> str:
     format_hints = {
-        "json": "Parse the JSON structure and implement all classes and functions with exact signatures.",
+        "json": """Parse the JSON structure carefully:
+- 'modules' array contains file-level info with 'classes' and 'functions'
+- Each class has 'name', 'bases', 'methods' with full signatures
+- Each function has 'name', 'params', 'returns', 'doc'
+- Implement ALL classes with their methods and ALL standalone functions
+- Use the 'doc' field to implement actual logic, not just stubs
+CRITICAL: Match every class/function name and signature exactly.""",
         "json_compact": "Parse the compact JSON and implement all elements with exact signatures.",
-        "yaml": "Parse the YAML structure and implement all classes and functions with exact signatures.",
-        "gherkin": """Parse Gherkin/BDD scenarios and implement them as working code:
-- Each Feature maps to a class or module
-- Each Scenario maps to a function
-- Given/When/Then steps describe the logic flow
-- Implement actual logic, not just stubs
-Focus on the described behavior and implement it directly.""",
-        "markdown": "Parse embedded Gherkin (behaviors) and YAML (structures). Implement all described classes and functions.",
-        "logicml": """Parse LogicML and generate VALID code:
-- 'sig:' lines describe function signatures (translate to the target language)
-- 'type: re-export' means this module primarily re-exports symbols
-- 'attrs:' = instance attributes to set in constructor
+        "yaml": """Parse the YAML structure carefully:
+- Top-level keys describe modules with classes and functions
+- Each class has 'bases', 'properties', 'methods' with signatures
+- Each function has params, return type, and docstring/intent
+- Implement ALL classes, methods, and standalone functions
+- Use intent/docstring to write actual logic, not placeholders
+CRITICAL: Match every name and signature exactly as specified.""",
+        "gherkin": """Parse Gherkin/BDD specification and reconstruct the ORIGINAL source code:
+- 'Feature:' = a class or module (use the name after Feature)
+- 'Scenario:' = a function or method to implement
+- 'Given' steps = setup / preconditions / imports needed
+- 'When' steps = the core action / logic to implement
+- 'Then' steps = expected outcomes / return values / assertions
+- 'And' continues the previous step type
+- '@tag' annotations may indicate decorators or categories
+
+IMPORTANT RULES:
+1. Each Scenario becomes a real function with actual logic (NOT test code)
+2. Given/When/Then describe behavior, translate them to implementation
+3. Include all imports mentioned in Given steps
+4. Use type hints based on parameter descriptions
+5. Implement real logic based on When/Then steps, not just stubs
+6. If a Feature has multiple Scenarios, they are methods of the same class""",
+        "markdown": """Parse the Markdown specification to reconstruct source code:
+- '## Module' or '### Class' headings define code structure
+- Embedded YAML blocks describe attributes, methods, signatures
+- Embedded Gherkin blocks describe behaviors to implement
+- Code blocks show example usage or signatures
+- Tables may list functions with their parameters and return types
+
+IMPORTANT RULES:
+1. Extract class names, method signatures, and function signatures from headings and YAML
+2. Implement all listed methods with actual logic based on descriptions
+3. Include all imports mentioned anywhere in the document
+4. Use type hints from signatures or parameter descriptions
+5. Docstrings should come from the description text""",
+        "logicml": """Parse LogicML and generate VALID, complete code:
+- 'module:' = file to generate
+- 'sig:' lines = EXACT function signatures (translate to target language)
+- 'does:' = function intent/docstring — use this to implement real logic
+- 'type: re-export' = module primarily re-exports symbols from imports
+- 'attrs:' = instance attributes to initialize in __init__/constructor
 - 'bases:' = parent classes to inherit from
-CRITICAL: Ensure valid syntax - balanced brackets, proper indentation, no undefined variables.""",
+- 'decorators:' = decorators to apply
+- 'calls:' = other functions this function calls (implement the call chain)
+- 'raises:' = exceptions this function may raise
+
+CRITICAL RULES:
+1. Translate EVERY 'sig:' line into a real function with actual logic
+2. Use 'does:' text to implement meaningful function bodies
+3. Ensure valid syntax - balanced brackets, proper indentation
+4. Include ALL imports listed in the module""",
         "toon": """Parse TOON (Token-Oriented Object Notation) format carefully:
 
 STRUCTURE:
@@ -222,19 +266,43 @@ def get_token_reproduction_prompt(spec: str, fmt: str, file_name: str, language:
 - 'decorators: @property' = add @property decorator
 - 'decorators: @staticmethod|@cache' = multiple decorators
 
-CRITICAL: Use imports[], function_docs, and exact signatures to reproduce code accurately.""",
-        "csv": """Parse the CSV table where each row describes a code element:
-- Columns: path, type (class/method/function), name, signature, language, intent, category, domain, imports
-- 'method' rows belong to the class in the preceding 'class' row
-- Implement all elements with the exact signatures shown
-Generate complete code with all classes, methods, and functions.""",
-        "function.toon": """Parse the function-logic TOON format:
-- 'modules[N]{path,lang,items}:' lists files
-- 'function_details:' contains per-module function listings
-- Each function has: line number, name, signature, description
-- 'ClassName.method_name' = method of that class
-- 'cc:N' after name = cyclomatic complexity
-Implement all listed functions with matching signatures and described behavior.""",
+CRITICAL RULES:
+1. Use imports[] to generate all import statements
+2. Use function_docs to write real function bodies (not stubs)
+3. Match exact signatures from sig: fields
+4. Include ALL classes with their methods and ALL standalone functions
+5. Preserve async functions (marked with 'async: true')""",
+        "csv": """Parse the CSV table to reconstruct source code:
+- Columns: path, type, name, signature, language, intent, category, domain, imports
+- 'type=class' rows define classes (look at 'bases' if present)
+- 'type=method' rows are methods of the preceding class
+- 'type=function' rows are standalone functions
+- 'signature' column has the exact function signature to use
+- 'intent' column describes what the function does — use it to implement real logic
+- 'imports' column lists required imports
+
+IMPORTANT RULES:
+1. Group methods under their parent class
+2. Include all imports from the 'imports' column
+3. Match signatures exactly as shown
+4. Use 'intent' to implement actual logic, not just stubs
+5. Add type hints based on signature information""",
+        "function.toon": """Parse the function-logic TOON format to reconstruct source code:
+- 'modules[N]{path,lang,items}:' lists source files and their function count
+- 'function_details:' contains per-module function listings as tables
+- Table columns: line, name, sig[, does, decorators, calls, raises]
+- 'ClassName.method_name' = this is a method of ClassName (create the class)
+- '~function_name' = async function (add async keyword)
+- 'cc:N' suffix on name = cyclomatic complexity hint (more complex logic needed)
+- 'sig' column has exact signature: (params)->ReturnType
+
+CRITICAL RULES:
+1. Create classes for any ClassName that appears as prefix in 'ClassName.method'
+2. Translate EVERY listed function into real code with actual logic
+3. Use 'does' column text to implement meaningful function bodies
+4. Match signatures EXACTLY from the 'sig' column
+5. Include imports needed for the types and calls referenced
+6. Preserve method grouping under their classes""",
     }
 
     # Language-specific guidance appended to prompt
@@ -248,7 +316,7 @@ def get_token_reproduction_prompt(spec: str, fmt: str, file_name: str, language:
         "sql": "Use standard SQL: CREATE TABLE/VIEW/FUNCTION, proper column types, constraints.",
     }
 
-    max_spec = 8000
+    max_spec = 12000
     spec_truncated = spec[:max_spec] if len(spec) > max_spec else spec
 
     language_norm = (language or "python").strip().lower()
@@ -267,15 +335,20 @@ def get_token_reproduction_prompt(spec: str, fmt: str, file_name: str, language:
     lang_hint = lang_hints.get(language_norm, '')
     lang_hint_line = f"\n{lang_hint}" if lang_hint else ''
 
-    prompt = f"""Generate {lang_label} code from this {fmt.upper()} specification.
+    prompt = f"""Generate complete {lang_label} source code from this {fmt.upper()} specification.
 {format_hints.get(fmt, '')}{lang_hint_line}
 
+SPECIFICATION:
 {spec_truncated}
 
-Requirements:
-- Complete, working {lang_label} code for {file_name}
-- Include imports and type hints
-- Implement all functions with actual logic
+REQUIREMENTS:
+- Output complete, working {lang_label} code for {file_name}
+- Include ALL imports at the top
+- Implement ALL classes, methods, and functions listed in the specification
+- Use type hints throughout
+- Write real logic based on descriptions/intents, NOT placeholder stubs
+- Match function signatures EXACTLY as specified
+- Output ONLY the code, no explanations
 
 ```{language_norm}
 """
 
@@ -118,6 +118,7 @@ class BenchmarkResult:
     avg_similarity: float = 0.0
     syntax_ok_rate: float = 0.0
     runs_ok_rate: float = 0.0
+    failure_rate: float = 0.0
 
     # Best format (for format comparisons)
     best_format: str = ""
@@ -144,17 +145,18 @@ def __post_init__(self):
 
     def calculate_aggregates(self):
         """Calculate aggregate metrics from detailed results."""
-        # File results
+        # File results – include ALL scores (zeros count as failures)
         if self.file_results:
-            scores = [r.score for r in self.file_results if r.score > 0]
-            self.avg_score = sum(scores) / len(scores) if scores else 0
+            all_scores = [r.score for r in self.file_results]
+            self.avg_score = sum(all_scores) / len(all_scores) if all_scores else 0
+            self.failure_rate = sum(1 for s in all_scores if s == 0) / len(all_scores) * 100
             self.syntax_ok_rate = sum(1 for r in self.file_results if r.syntax_ok) / len(self.file_results) * 100
             self.runs_ok_rate = sum(1 for r in self.file_results if r.runs_ok) / len(self.file_results) * 100
 
-        # Function results
+        # Function results – include ALL similarities
         if self.function_results:
-            sims = [r.similarity for r in self.function_results if r.similarity > 0]
-            self.avg_similarity = sum(sims) / len(sims) if sims else 0
+            all_sims = [r.similarity for r in self.function_results]
+            self.avg_similarity = sum(all_sims) / len(all_sims) if all_sims else 0
 
         # Format results
         if self.format_results:
@@ -187,10 +189,24 @@ def load(cls, path: str) -> 'BenchmarkResult':
         """Load result from JSON file."""
         data = json.loads(Path(path).read_text())
         # Reconstruct nested objects
-        file_results = [FileResult(**r) for r in data.pop('file_results', [])]
+        raw_file_results = data.pop('file_results', [])
+        file_results = []
+        for r in raw_file_results:
+            fmt_results_raw = r.pop('format_results', {})
+            fr = FileResult(**r)
+            fr.format_results = {
+                k: FormatResult(**v) if isinstance(v, dict) else v
+                for k, v in fmt_results_raw.items()
+            }
+            file_results.append(fr)
         function_results = [FunctionResult(**r) for r in data.pop('function_results', [])]
         format_results = [FormatResult(**r) for r in data.pop('format_results', [])]
 
+        # Remove unknown fields that may not be in the dataclass
+        import dataclasses
+        known_fields = {f.name for f in dataclasses.fields(cls)}
+        data = {k: v for k, v in data.items() if k in known_fields}
+
         result = cls(**data)
         result.file_results = file_results
         result.function_results = function_results
 
@@ -147,20 +147,32 @@ def _structural_score(original: str, generated: str, language: str) -> float:
     if not o:
         return 0.0
     keys = list(o.keys())
-    matches = 0
+    total = 0.0
     for k in keys:
-        if o.get(k, 0) == g.get(k, 0):
-            matches += 1
-    return matches / max(len(keys), 1) * 100
+        ov = o.get(k, 0)
+        gv = g.get(k, 0)
+        if ov == 0 and gv == 0:
+            total += 1.0
+        elif max(ov, gv) > 0:
+            total += min(ov, gv) / max(ov, gv)
+    return total / max(len(keys), 1) * 100
 
 
 def _extract_code(response: str) -> str:
     """Extract code from LLM response."""
     if not response:
         return ""
 
-    # Try to find code block
-    for marker in ['```python', '```py', '```']:
+    # Try to find code block — check language-specific markers first, then generic
+    markers = [
+        '```python', '```py',
+        '```javascript', '```js', '```typescript', '```ts',
+        '```go', '```rust', '```rs',
+        '```java', '```csharp', '```cs', '```c#',
+        '```sql',
+        '```',
+    ]
+    for marker in markers:
         if marker in response:
             start = response.find(marker) + len(marker)
             if start < len(response) and response[start] == '\n':
@@ -469,12 +481,12 @@ def run_format_benchmark(
 
         result.total_time = time.time() - start_time
 
-        # Calculate format aggregates
+        # Calculate format aggregates – include ALL scores (zeros = failures)
         for fmt in formats:
             scores = [
                 fr.format_results[fmt].score
                 for fr in result.file_results
-                if fmt in fr.format_results and fr.format_results[fmt].score > 0
+                if fmt in fr.format_results
             ]
             if scores:
                 result.format_scores[fmt] = sum(scores) / len(scores)
@@ -762,24 +774,34 @@ def _test_function(
 
             result.original_code = '\n'.join(lines[start:end])
 
-            # Create spec
+            # Create spec with richer context
+            calls_str = ', '.join(getattr(func, 'calls', []) or []) or 'None'
+            raises_str = ', '.join(getattr(func, 'raises', []) or []) or 'None'
+            cc = getattr(func, 'complexity', 1) or 1
             spec = f"""Function: {func.name}
 Language: {language}
 Signature: {func.name}({', '.join(func.params)}) -> {func.return_type or 'None'}
 Description: {func.intent or func.docstring or 'No description'}
 Is Async: {func.is_async}
 Decorators: {', '.join(func.decorators) if func.decorators else 'None'}
+Calls: {calls_str}
+Raises: {raises_str}
+Complexity: {cc}
 Lines: {func.lines}
 """
 
-            prompt = f"""Generate ONLY the function code based on this specification:
+            prompt = f"""Generate ONLY the complete function code based on this specification:
 
 {spec}
 
-Requirements:
-- Generate complete, working {language} function
-- Match the signature exactly
-- Output ONLY the function code
+REQUIREMENTS:
+- Generate a complete, working {language} function with REAL logic (not a stub)
+- Match the signature EXACTLY: {func.name}({', '.join(func.params)}) -> {func.return_type or 'None'}
+- Use the Description to implement actual behavior
+- Include decorators if specified
+- The function should be ~{func.lines} lines long
+- Include proper error handling if Raises is specified
+- Output ONLY the function code, no explanations
 
 ```{language}
 """
@@ -793,7 +815,7 @@ def _test_function(
                 result.gen_time = 0.0
             else:
                 start_time = time.time()
-                response = client.generate(prompt, max_tokens=2000)
+                response = client.generate(prompt, max_tokens=3000)
                 result.gen_time = time.time() - start_time
                 result.reproduced_code = _extract_code(response)
 
@@ -942,17 +964,23 @@ def run_project_benchmark(
 
         result.total_time = time.time() - start_time
 
-        # Calculate format aggregates
+        # Calculate format aggregates – include ALL scores (zeros = failures)
         for fmt in formats:
             scores = []
             for fr in result.file_results:
                 if fmt in fr.format_results:
-                    score = fr.format_results[fmt].score
-                    if score > 0:
-                        scores.append(score)
+                    scores.append(fr.format_results[fmt].score)
             if scores:
                 result.format_scores[fmt] = sum(scores) / len(scores)
 
+        # Recalculate each file's score as average across all its formats
+        for fr in result.file_results:
+            if fr.format_results:
+                fmt_scores = [r.score for r in fr.format_results.values()]
+                fr.score = sum(fmt_scores) / len(fmt_scores)
+                fr.syntax_ok = all(r.syntax_ok for r in fr.format_results.values())
+                fr.runs_ok = any(r.runs_ok for r in fr.format_results.values())
+
         result.calculate_aggregates()
 
         return result
 
@@ -673,6 +673,13 @@ def _maybe_print_pretty_help() -> bool:
         action='store_true',
         help='Include the does/intent column in function-logic TOON output. Without this flag, the does column is omitted to save tokens.'
     )
+    parser.add_argument(
+        '--function-logic-context',
+        choices=['none', 'minimal', 'full'],
+        default='none',
+        dest='function_logic_context',
+        help='Structural context in function-logic TOON: none (flat list), minimal (class headers with bases), full (classes + properties + imports). Default: none.'
+    )
     parser.add_argument(
         '--no-install',
         action='store_true',