Skip to content

Commit 7b7ac0c

Browse files
author
Zvi Fried
committed
workflow fixes
1 parent b056d7d commit 7b7ac0c

20 files changed

+668
-56
lines changed

src/mcp_as_a_judge/models.py

Lines changed: 48 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,51 @@ class ResearchValidationResponse(BaseModel):
109109
)
110110

111111

112+
class ResearchAspect(BaseModel):
    """A major aspect that research must cover (LLM-extracted)."""

    # `name` is the only field without a default; all others may be omitted.
    name: str = Field(
        description="Canonical aspect name relevant to the task",
    )
    # Alternative wordings for the aspect; a fresh empty list per instance.
    synonyms: list[str] = Field(
        description="List of synonymous terms or phrases that indicate coverage",
        default_factory=list,
    )
    # Aspects count as required unless explicitly marked otherwise.
    required: bool = Field(
        description="Whether coverage of this aspect is required for approval",
        default=True,
    )
    # Free-form grouping label; None when no category was given.
    category: str | None = Field(
        description="Optional category, e.g., 'protocol', 'framework', 'deployment'",
        default=None,
    )
    rationale: str = Field(
        description="Why this aspect is required for this task",
        default="",
    )
132+
133+
134+
class ResearchAspectsExtraction(BaseModel):
    """LLM-extracted research aspects that should be covered."""

    # Both fields are optional: an empty payload yields no aspects and no notes.
    aspects: list[ResearchAspect] = Field(
        description="List of aspects that research must cover for this task",
        default_factory=list,
    )
    notes: str = Field(
        description="Additional notes about coverage or prioritization",
        default="",
    )
145+
146+
147+
class ResearchAspectsUserVars(BaseModel):
    """Variables for research aspects extraction user prompt."""

    # Every variable is a required string; none of them declares a default.
    task_title: str = Field(
        description="Title of the coding task",
    )
    task_description: str = Field(
        description="Detailed description of the task",
    )
    user_requirements: str = Field(
        description="User requirements for the task",
    )
    plan: str = Field(
        description="Implementation plan text (can be brief)",
    )
    design: str = Field(
        description="Design summary (can be brief)",
    )
155+
156+
112157
class ResearchComplexityFactors(BaseModel):
113158
"""Analysis factors for determining research complexity."""
114159

@@ -265,7 +310,9 @@ class JudgeCodeChangeUserVars(BaseModel):
265310
)
266311
file_path: str = Field(description="Path to the file being changed")
267312
change_description: str = Field(description="Description of what the change does")
268-
code_change: str = Field(description="The actual code content being reviewed")
313+
code_change: str = Field(
314+
description="Unified Git diff patch representing the code changes under review"
315+
)
269316
context: str = Field(description="Additional context about the code change")
270317
conversation_history: list = Field(
271318
default_factory=list,

src/mcp_as_a_judge/models/__init__.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,9 @@
4545
"ResearchRequirementsAnalysisUserVars",
4646
"ResearchValidationResponse",
4747
"ResearchValidationUserVars",
48+
"ResearchAspect",
49+
"ResearchAspectsExtraction",
50+
"ResearchAspectsUserVars",
4851
"SystemVars",
4952
"TaskAnalysisResult",
5053
"TaskCompletionResult",
@@ -218,6 +221,9 @@ def _load_models_py() -> _Any | None:
218221
"JudgeCodingPlanUserVars",
219222
"ResearchValidationResponse",
220223
"ResearchValidationUserVars",
224+
"ResearchAspect",
225+
"ResearchAspectsExtraction",
226+
"ResearchAspectsUserVars",
221227
"WorkflowGuidanceUserVars",
222228
"DynamicSchemaUserVars",
223229
"ValidationErrorUserVars",

src/mcp_as_a_judge/models/enhanced_responses.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,15 @@ class JudgeResponse(TrimmedBaseModel):
3434
)
3535
feedback: str = Field(description="Detailed feedback about the validation")
3636

37+
# Optional unified Git diff with suggested fixes or refinements
38+
suggested_diff: str | None = Field(
39+
default=None,
40+
description=(
41+
"Unified Git diff patch with suggested changes (optional). "
42+
"Provide when rejecting with concrete fixes or when proposing minor refinements."
43+
),
44+
)
45+
3746
current_task_metadata: TaskMetadata = Field(
3847
default_factory=lambda: TaskMetadata(
3948
title="Unknown Task",

src/mcp_as_a_judge/models/task_metadata.py

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -161,6 +161,51 @@ class TaskMetadata(BaseModel):
161161

162162
# (No explicit decision ledger; decisions are handled via LLM-driven elicitation and conversation history)
163163

164+
# PROBLEM DOMAIN & REUSE/DEPENDENCIES PLAN - For enforcing domain focus and avoiding reinvention
165+
problem_domain: str = Field(
166+
default="",
167+
description="Concise statement of the problem domain and scope for this task",
168+
)
169+
problem_non_goals: list[str] = Field(
170+
default_factory=list,
171+
description="Explicit non-goals/boundaries to prevent scope creep and re-solving commodity concerns",
172+
)
173+
174+
class LibraryPlanItem(BaseModel):
    # One entry of the library plan: which library or utility covers which
    # concern. `purpose`, `selection` and `source` are required;
    # `justification` defaults to an empty string.
    purpose: str = Field(
        description="Non-domain concern or integration point this library addresses",
    )
    selection: str = Field(
        description="Chosen library or internal utility (name and optional version)",
    )
    # Plain string: the three values named in the description are not
    # enforced by the type.
    source: str = Field(
        description="Source of solution: 'internal' for repo utility, 'external' for well-known library, 'custom' for in-house code",
    )
    justification: str = Field(
        description="One-line rationale for the selection and any trade-offs",
        default="",
    )
188+
189+
library_plan: list[LibraryPlanItem] = Field(
190+
default_factory=list,
191+
description=(
192+
"Planned libraries/utilities per purpose; prefer internal reuse and well-known libraries; custom code only with justification"
193+
),
194+
)
195+
196+
class ReuseComponent(BaseModel):
    # An existing repository component earmarked for reuse. Only `path` is
    # required; `purpose` and `notes` default to empty strings.
    path: str = Field(
        description="Repository path to the reusable component",
    )
    purpose: str = Field(
        description="What part of the task this component will support",
        default="",
    )
    notes: str = Field(
        description="Any integration notes or caveats",
        default="",
    )
203+
204+
internal_reuse_components: list[ReuseComponent] = Field(
205+
default_factory=list,
206+
description="Existing repository components/utilities to reuse with paths and purposes",
207+
)
208+
164209
# RESEARCH TRACKING FIELDS - Added for workflow-driven research validation
165210
research_required: bool | None = Field(
166211
default=None,

src/mcp_as_a_judge/prompts/system/judge_code_change.md

Lines changed: 17 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# Software Engineering Judge - Code Review System Instructions
22

3-
You are an expert software engineering judge specializing in code review. Your role is to evaluate code changes and provide feedback on quality, security, and best practices.
3+
You are an expert software engineering judge specializing in code review. Your role is to evaluate code changes strictly based on the provided unified Git diff and provide precise, actionable feedback and, when needed, a corrected diff.
44

55
{% include 'shared/response_constraints.md' %}
66

@@ -12,6 +12,11 @@ You are an expert software engineering judge specializing in code review. Your r
1212
- Error handling and defensive programming
1313
- Testing and debugging strategies
1414

15+
## Input Requirements
16+
17+
- The `code_change` field MUST be a unified Git diff patch (e.g., contains `diff --git`, `---`, `+++`, and `@@` hunk headers).
18+
- If the input is not a diff, you MUST return `approved: false` with `required_improvements` that includes: "Provide a unified Git diff patch of the changes for review". Do not approve non-diff inputs and do not provide generic narrative approvals.
19+
1520
## Evaluation Criteria
1621

1722
Evaluate code content against the following comprehensive criteria:
@@ -76,10 +81,10 @@ Evaluate code content against the following comprehensive criteria:
7681

7782
### 7. Dependencies & Reuse
7883

79-
- Are third-party libraries used appropriately?
80-
- Is existing code reused where possible?
84+
- Are third-party libraries used appropriately and preferentially for commodity concerns?
85+
- Is existing code reused where possible (current repo > well-known libraries > custom code)?
8186
- Are new dependencies justified and well-vetted?
82-
- **Don't Reinvent the Wheel**: Are standard solutions used where appropriate?
87+
- MANDATORY: Do not reimplement solved/commodity areas without strong justification. Prefer integrating an internal utility or a well-known library; request changes when custom code replaces established solutions.
8388

8489
### 8. Maintainability & Evolution
8590

@@ -96,6 +101,7 @@ Evaluate code content against the following comprehensive criteria:
96101
- **Broken Windows Theory**: Focus on issues that will compound over time if left unfixed
97102
- **Context-Driven**: Consider complexity, timeline, and constraints when evaluating
98103
- **Constructive Feedback**: Provide actionable guidance for improvement
104+
- **Library Preference**: Prefer integrating existing internal components or well-known libraries over custom implementations. Flag and require changes when custom code replaces established solutions without justification.
99105

100106
### Human-in-the-Loop (HITL) Guidance
101107
- If foundational choices appear ambiguous, missing, or changed (framework/library, UI vs CLI, web vs desktop, API style, auth, hosting):
@@ -127,6 +133,7 @@ Evaluate code content against the following comprehensive criteria:
127133
- **Broken Windows**: Quality issues that will encourage more poor code
128134
- **Tight Coupling**: Code that makes future changes difficult
129135
- **Premature Optimization**: Complex optimizations without clear benefit
136+
- **Reinvented Wheels**: Custom implementations of common concerns where a well-known library or existing internal component should be used
130137

131138
## Response Requirements
132139

@@ -135,8 +142,14 @@ You must respond with a JSON object that matches this schema:
135142

136143
## Key Principles
137144

145+
- **REVIEW THE DIFF ONLY**: Base your analysis strictly on the provided unified diff. Do not infer unrelated parts of the codebase.
138146
- **PROVIDE ALL FEEDBACK AT ONCE**: Give comprehensive feedback in a single response covering all identified issues
139147
- If requiring revision, limit to 3-5 most critical issues
140148
- Remember: "Don't let perfect be the enemy of good enough"
141149
- Focus on what matters most for maintainable, working software
142150
- **Complete Analysis**: Ensure your evaluation covers SOLID principles, design patterns (when applicable), and all other criteria in one thorough review
151+
152+
### Suggested Fixes
153+
154+
- When you reject (`approved: false`), include a concise explanation in `feedback` and, if feasible, provide a corrected minimal patch in a unified Git diff format in the `suggested_diff` field.
155+
- When you approve (`approved: true`) and have minor optional improvements, you may include a non-blocking `suggested_diff` with minor refinements.

src/mcp_as_a_judge/prompts/system/judge_coding_plan.md

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,14 @@ Evaluate submissions against the following comprehensive SWE best practices:
3434
- **DRY Principle**: Does it avoid duplication and promote reusability?
3535
- **Orthogonality**: Are components independent and loosely coupled?
3636

37+
### 1a. Problem Domain Focus & Library Plan — MANDATORY
38+
39+
- Problem Domain Statement: Provide a concise statement of the problem being solved, with explicit non-goals to prevent scope creep.
40+
- Solved Areas Boundary: Clearly mark commodity/non-domain concerns as “solved externally” unless a compelling justification exists.
41+
- Library Selection Map (Required Deliverable): For each non-domain concern, list the chosen internal utility or well-known library and its purpose, with a one-line justification. Preference order: existing repo utilities > well-known libraries > custom code (last resort, with justification).
42+
- Internal Reuse Map (Required Deliverable): Identify existing repository components/utilities to reuse with file paths.
43+
- Plans missing these deliverables must be rejected with required improvements.
44+
3745
### 2. Independent Research Types Evaluation
3846

3947
**🔍 External Research (ONLY evaluate if Status: REQUIRED):**
@@ -68,6 +76,13 @@ IMPORTANT applicability rule:
6876
- Does it avoid over-engineering or under-engineering?
6977
- **Reversibility**: Can decisions be easily changed if requirements evolve?
7078
- **Tracer Bullets**: Is there a plan for incremental development and validation?
79+
- **Dependency Integration Plan**: Are selected libraries integrated behind clear seams (adapters/ports) to keep the solution replaceable and testable?
80+
81+
Output mapping requirement: Populate these fields in current_task_metadata for downstream tools to consume:
82+
- current_task_metadata.problem_domain (string)
83+
- current_task_metadata.problem_non_goals (array of strings)
84+
- current_task_metadata.library_plan (array of objects: purpose, selection, source [internal|external|custom], justification)
85+
- current_task_metadata.internal_reuse_components (array of objects: path, purpose, notes)
7186

7287
### 4. Security & Robustness
7388

@@ -168,6 +183,7 @@ IMPORTANT applicability rule:
168183
- **STRONGLY PREFER**: Existing solutions (current repo > well-known libraries > in-house development)
169184
- **FLAG IMMEDIATELY**: Any attempt to build from scratch what already exists
170185
- **RESEARCH QUALITY**: Is research based on current repo state + user requirements + online investigation?
186+
- **MANDATORY DELIVERABLES**: Library Selection Map and Internal Reuse Map must be present and specific; reject if absent or superficial.
171187

172188
### 3. Ensure Generic Solutions
173189

@@ -239,3 +255,4 @@ You must respond with a JSON object that matches this schema:
239255
- Remember: "Perfect is the enemy of good enough"
240256
- Focus on what matters most for maintainable, working software
241257
- **Complete Analysis**: Ensure your evaluation covers SOLID principles, design patterns (when applicable), and all other criteria in one thorough review
258+
- **Enforcement**: Reject plans that do not include a clear Problem Domain Statement, Library Selection Map, and Internal Reuse Map.

src/mcp_as_a_judge/prompts/system/judge_testing_implementation.md

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,13 @@
22

33
You are an expert testing evaluation specialist responsible for comprehensively assessing test implementations for coding tasks. Your role is to ensure that tests are high-quality, comprehensive, and truly validate the implemented functionality.
44

5+
## Input Requirements
6+
7+
- You MUST be provided with real test evidence:
8+
- A non-empty list of `test_files` that were created/modified
9+
- `test_execution_results` containing raw test runner output (e.g., pytest/jest/mocha/go test/JUnit logs) with pass/fail counts
10+
- If evidence is missing or looks like a narrative summary instead of raw output, you MUST return `approved: false` and require the raw test output and file list.
11+
512
## Core Responsibilities
613

714
### 1. Test Quality Assessment
@@ -57,6 +64,12 @@ Provide your evaluation in the following JSON format:
5764
{{ response_schema }}
5865
```
5966

67+
### Evidence Validation
68+
69+
- If `test_files` is empty OR `test_execution_results` does not appear to be raw test output (no pass/fail counts, no standard runner markers), return `approved: false` with `required_improvements`:
70+
- "Provide raw test runner output including pass/fail summary"
71+
- "List the test files created/modified"
72+
6073
## Key Evaluation Points
6174

6275
### Test Coverage Analysis
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
# Research Aspects Extraction - System Instructions
2+
3+
You extract a clean, generic list of major aspects that research must cover for a development task.
4+
5+
Guidelines:
6+
- Identify systems, frameworks, protocols, integrations, deployment/ops, and security/auth topics implied by the requirements, plan, and design.
7+
- Use canonical, concise names; avoid vendor- or example-specific bias beyond what the task implies.
8+
- Provide helpful synonyms/aliases for each aspect to support coverage detection.
9+
- Mark aspects as required when they are critical to safely implementing the task.
10+
- Keep the output strictly within the provided JSON schema.
11+
12+
Output strictly matches the provided response_schema.
13+

src/mcp_as_a_judge/prompts/system/research_requirements_analysis.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,11 @@ Always emphasize research quality over pure quantity:
8888
- Coverage of implementation details and edge cases
8989
- Multi-aspect coverage: Ensure the research plan explicitly maps to ALL major aspects implied by the user requirements (each referenced system, framework, protocol, integration), rather than focusing on a single subset.
9090

91+
### Library & Reuse Research (Strongly Encouraged / Often Required)
92+
- Identify well-known libraries or internal utilities for each non-domain concern relevant to the task.
93+
- Compare credible options when relevant and recommend one with justification.
94+
- Survey existing repository utilities/components for reuse and list candidates with file paths.
95+
9196
## Analysis Output Requirements
9297

9398
Provide structured analysis considering:

src/mcp_as_a_judge/prompts/system/research_validation.md

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ Evaluate if the research is comprehensive enough and if the design is properly b
3232
- **RESEARCH INTEGRATION**: Are insights from current repo + online research properly incorporated into the approach?
3333
- **NO REINVENTING**: Does it avoid reinventing the wheel unnecessarily?
3434
- **JUSTIFICATION REQUIRED**: If proposing new development, is there clear justification why existing solutions won't work?
35+
- **LIBRARIES WIRED-IN**: Does the design show how chosen libraries or internal components will be integrated (adapters/ports, configuration, initialization)?
3536

3637
### 3. Research Quality - MANDATORY VALIDATION
3738

@@ -42,9 +43,14 @@ Evaluate if the research is comprehensive enough and if the design is properly b
4243
- **🌐 MANDATORY: Online Research URLs**: Are research URLs provided? Online research is MANDATORY.
4344
- **REJECT IF MISSING**: No URLs provided means no online research was performed - REJECT immediately
4445
- **ONLINE RESEARCH EVIDENCE**: Do URLs demonstrate actual online research into implementation approaches and existing libraries?
45-
- **EXISTING SOLUTIONS FOCUS**: Do URLs show research into current repo capabilities, well-known libraries, and best practices?
46-
- **FULL REQUIREMENTS COVERAGE**: Do the provided URLs collectively cover ALL major aspects implied by the user requirements (each named system, framework, protocol, integration), rather than focusing on a single subset?
47-
- **REJECT IMMEDIATELY**: Missing URLs, insufficient online research, or failure to investigate existing solutions first
46+
- **EXISTING SOLUTIONS FOCUS**: Do URLs show research into current repo capabilities, well-known libraries, and best practices?
47+
- **FULL REQUIREMENTS COVERAGE**: Do the provided URLs collectively cover ALL major aspects implied by the user requirements (each named system, framework, protocol, integration), rather than focusing on a single subset?
48+
- **REJECT IMMEDIATELY**: Missing URLs, insufficient online research, or failure to investigate existing solutions first
49+
50+
### 1a. Library Selection Evidence — REQUIRED WHEN APPLICABLE
51+
- Are specific libraries/frameworks identified for each non-domain concern with links to credible docs?
52+
- Is there a brief trade-off analysis where multiple mature options exist?
53+
- Is internal reuse considered with concrete file references where applicable?
4854

4955
## Response Requirements
5056

0 commit comments

Comments
 (0)