OtherVibes
diff --git a/‎src/mcp_as_a_judge/models.py‎
Lines changed: 48 additions & 1 deletion b/‎src/mcp_as_a_judge/models.py‎
Lines changed: 48 additions & 1 deletion
diff --git a/‎src/mcp_as_a_judge/models/__init__.py‎
Lines changed: 6 additions & 0 deletions b/‎src/mcp_as_a_judge/models/__init__.py‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎src/mcp_as_a_judge/models/enhanced_responses.py‎
Lines changed: 9 additions & 0 deletions b/‎src/mcp_as_a_judge/models/enhanced_responses.py‎
Lines changed: 9 additions & 0 deletions
diff --git a/‎src/mcp_as_a_judge/models/task_metadata.py‎
Lines changed: 45 additions & 0 deletions b/‎src/mcp_as_a_judge/models/task_metadata.py‎
Lines changed: 45 additions & 0 deletions
diff --git a/‎src/mcp_as_a_judge/prompts/system/judge_code_change.md‎
Lines changed: 17 additions & 4 deletions b/‎src/mcp_as_a_judge/prompts/system/judge_code_change.md‎
Lines changed: 17 additions & 4 deletions
diff --git a/‎src/mcp_as_a_judge/prompts/system/judge_coding_plan.md‎
Lines changed: 17 additions & 0 deletions b/‎src/mcp_as_a_judge/prompts/system/judge_coding_plan.md‎
Lines changed: 17 additions & 0 deletions
diff --git a/‎src/mcp_as_a_judge/prompts/system/judge_testing_implementation.md‎
Lines changed: 13 additions & 0 deletions b/‎src/mcp_as_a_judge/prompts/system/judge_testing_implementation.md‎
Lines changed: 13 additions & 0 deletions
diff --git a/‎src/mcp_as_a_judge/prompts/system/research_aspects.md‎
Lines changed: 13 additions & 0 deletions b/‎src/mcp_as_a_judge/prompts/system/research_aspects.md‎
Lines changed: 13 additions & 0 deletions
diff --git a/‎src/mcp_as_a_judge/prompts/system/research_requirements_analysis.md‎
Lines changed: 5 additions & 0 deletions b/‎src/mcp_as_a_judge/prompts/system/research_requirements_analysis.md‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎src/mcp_as_a_judge/prompts/system/research_validation.md‎
Lines changed: 9 additions & 3 deletions b/‎src/mcp_as_a_judge/prompts/system/research_validation.md‎
Lines changed: 9 additions & 3 deletions
@@ -109,6 +109,51 @@ class ResearchValidationResponse(BaseModel):
     )
 
 
+class ResearchAspect(BaseModel):  # only `name` is mandatory; all other fields default
+    """A major aspect that research must cover (LLM-extracted)."""
+
+    name: str = Field(description="Canonical aspect name relevant to the task")
+    synonyms: list[str] = Field(
+        default_factory=list,
+        description="List of synonymous terms or phrases that indicate coverage",
+    )
+    required: bool = Field(  # True unless explicitly set otherwise
+        default=True,
+        description="Whether coverage of this aspect is required for approval",
+    )
+    category: str | None = Field(  # free-form label, not an enum; None when unset
+        default=None,
+        description="Optional category, e.g., 'protocol', 'framework', 'deployment'",
+    )
+    rationale: str = Field(
+        default="",
+        description="Why this aspect is required for this task",
+    )
+
+
+class ResearchAspectsExtraction(BaseModel):  # container for a list of ResearchAspect
+    """LLM-extracted research aspects that should be covered."""
+
+    aspects: list[ResearchAspect] = Field(  # defaults to an empty list
+        default_factory=list,
+        description="List of aspects that research must cover for this task",
+    )
+    notes: str = Field(  # defaults to an empty string
+        default="",
+        description="Additional notes about coverage or prioritization",
+    )
+
+
+class ResearchAspectsUserVars(BaseModel):  # all five fields are required (no defaults)
+    """Variables for research aspects extraction user prompt."""
+
+    task_title: str = Field(description="Title of the coding task")
+    task_description: str = Field(description="Detailed description of the task")
+    user_requirements: str = Field(description="User requirements for the task")
+    plan: str = Field(description="Implementation plan text (can be brief)")
+    design: str = Field(description="Design summary (can be brief)")
+
+
 class ResearchComplexityFactors(BaseModel):
     """Analysis factors for determining research complexity."""
 
@@ -265,7 +310,9 @@ class JudgeCodeChangeUserVars(BaseModel):
     )
     file_path: str = Field(description="Path to the file being changed")
     change_description: str = Field(description="Description of what the change does")
-    code_change: str = Field(description="The actual code content being reviewed")
+    code_change: str = Field(
+        description="Unified Git diff patch representing the code changes under review"
+    )
     context: str = Field(description="Additional context about the code change")
     conversation_history: list = Field(
         default_factory=list,
 
@@ -45,6 +45,9 @@
     "ResearchRequirementsAnalysisUserVars",
     "ResearchValidationResponse",
     "ResearchValidationUserVars",
+    "ResearchAspect",
+    "ResearchAspectsExtraction",
+    "ResearchAspectsUserVars",
     "SystemVars",
     "TaskAnalysisResult",
     "TaskCompletionResult",
@@ -218,6 +221,9 @@ def _load_models_py() -> _Any | None:
     "JudgeCodingPlanUserVars",
     "ResearchValidationResponse",
     "ResearchValidationUserVars",
+    "ResearchAspect",
+    "ResearchAspectsExtraction",
+    "ResearchAspectsUserVars",
     "WorkflowGuidanceUserVars",
     "DynamicSchemaUserVars",
     "ValidationErrorUserVars",
 
@@ -34,6 +34,15 @@ class JudgeResponse(TrimmedBaseModel):
     )
     feedback: str = Field(description="Detailed feedback about the validation")
 
+    # Optional unified Git diff with suggested fixes or refinements
+    suggested_diff: str | None = Field(
+        default=None,
+        description=(
+            "Unified Git diff patch with suggested changes (optional). "
+            "Provide when rejecting with concrete fixes or when proposing minor refinements."
+        ),
+    )
+
     current_task_metadata: TaskMetadata = Field(
         default_factory=lambda: TaskMetadata(
             title="Unknown Task",
 
@@ -161,6 +161,51 @@ class TaskMetadata(BaseModel):
 
     # (No explicit decision ledger; decisions are handled via LLM-driven elicitation and conversation history)
 
+    # PROBLEM DOMAIN & REUSE/DEPENDENCIES PLAN - For enforcing domain focus and avoiding reinvention
+    problem_domain: str = Field(
+        default="",
+        description="Concise statement of the problem domain and scope for this task",
+    )
+    problem_non_goals: list[str] = Field(
+        default_factory=list,
+        description="Explicit non-goals/boundaries to prevent scope creep and re-solving commodity concerns",
+    )
+
+    class LibraryPlanItem(BaseModel):  # element type of the `library_plan` list
+        purpose: str = Field(
+            description="Non-domain concern or integration point this library addresses"
+        )
+        selection: str = Field(
+            description="Chosen library or internal utility (name and optional version)"
+        )
+        source: str = Field(  # plain str: the values named below are not enforced by the type
+            description="Source of solution: 'internal' for repo utility, 'external' for well-known library, 'custom' for in-house code"
+        )
+        justification: str = Field(  # the only optional field; defaults to ""
+            default="",
+            description="One-line rationale for the selection and any trade-offs",
+        )
+
+    library_plan: list[LibraryPlanItem] = Field(
+        default_factory=list,
+        description=(
+            "Planned libraries/utilities per purpose; prefer internal reuse and well-known libraries; custom code only with justification"
+        ),
+    )
+
+    class ReuseComponent(BaseModel):  # element type of `internal_reuse_components`
+        path: str = Field(description="Repository path to the reusable component")  # required
+        purpose: str = Field(  # optional; defaults to ""
+            default="",
+            description="What part of the task this component will support",
+        )
+        notes: str = Field(default="", description="Any integration notes or caveats")
+
+    internal_reuse_components: list[ReuseComponent] = Field(
+        default_factory=list,
+        description="Existing repository components/utilities to reuse with paths and purposes",
+    )
+
     # RESEARCH TRACKING FIELDS - Added for workflow-driven research validation
     research_required: bool | None = Field(
         default=None,
 
@@ -1,6 +1,6 @@
 # Software Engineering Judge - Code Review System Instructions
 
-You are an expert software engineering judge specializing in code review. Your role is to evaluate code changes and provide feedback on quality, security, and best practices.
+You are an expert software engineering judge specializing in code review. Your role is to evaluate code changes strictly based on the provided unified Git diff and provide precise, actionable feedback and, when needed, a corrected diff.
 
 {% include 'shared/response_constraints.md' %}
 
@@ -12,6 +12,11 @@ You are an expert software engineering judge specializing in code review. Your r
 - Error handling and defensive programming
 - Testing and debugging strategies
 
+## Input Requirements
+
+- The `code_change` field MUST be a unified Git diff patch (e.g., contains `diff --git`, `---`, `+++`, and `@@` hunk headers).
+- If the input is not a diff, you MUST return `approved: false` with `required_improvements` that includes: "Provide a unified Git diff patch of the changes for review". Do not approve non-diff inputs and do not provide generic narrative approvals.
+
 ## Evaluation Criteria
 
 Evaluate code content against the following comprehensive criteria:
@@ -76,10 +81,10 @@ Evaluate code content against the following comprehensive criteria:
 
 ### 7. Dependencies & Reuse
 
-- Are third-party libraries used appropriately?
-- Is existing code reused where possible?
+- Are third-party libraries used appropriately and preferentially for commodity concerns?
+- Is existing code reused where possible (current repo > well-known libraries > custom code)?
 - Are new dependencies justified and well-vetted?
-- **Don't Reinvent the Wheel**: Are standard solutions used where appropriate?
+- MANDATORY: Do not reimplement solved/commodity areas without strong justification. Prefer integrating an internal utility or a well-known library; request changes when custom code replaces established solutions.
 
 ### 8. Maintainability & Evolution
 
@@ -96,6 +101,7 @@ Evaluate code content against the following comprehensive criteria:
 - **Broken Windows Theory**: Focus on issues that will compound over time if left unfixed
 - **Context-Driven**: Consider complexity, timeline, and constraints when evaluating
 - **Constructive Feedback**: Provide actionable guidance for improvement
+- **Library Preference**: Prefer integrating existing internal components or well-known libraries over custom implementations. Flag and require changes when custom code replaces established solutions without justification.
 
 ### Human-in-the-Loop (HITL) Guidance
 - If foundational choices appear ambiguous, missing, or changed (framework/library, UI vs CLI, web vs desktop, API style, auth, hosting):
@@ -127,6 +133,7 @@ Evaluate code content against the following comprehensive criteria:
 - **Broken Windows**: Quality issues that will encourage more poor code
 - **Tight Coupling**: Code that makes future changes difficult
 - **Premature Optimization**: Complex optimizations without clear benefit
+ - **Reinvented Wheels**: Custom implementations of common concerns where a well-known library or existing internal component should be used
 
 ## Response Requirements
 
@@ -135,8 +142,14 @@ You must respond with a JSON object that matches this schema:
 
 ## Key Principles
 
+- **REVIEW THE DIFF ONLY**: Base your analysis strictly on the provided unified diff. Do not infer unrelated parts of the codebase.
 - **PROVIDE ALL FEEDBACK AT ONCE**: Give comprehensive feedback in a single response covering all identified issues
 - If requiring revision, limit to 3-5 most critical issues
 - Remember: "Don't let perfect be the enemy of good enough"
 - Focus on what matters most for maintainable, working software
 - **Complete Analysis**: Ensure your evaluation covers SOLID principles, design patterns (when applicable), and all other criteria in one thorough review
+
+### Suggested Fixes
+
+- When you reject (`approved: false`), include a concise explanation in `feedback` and, if feasible, provide a corrected minimal patch in a unified Git diff format in the `suggested_diff` field.
+- When you approve (`approved: true`) and have minor optional improvements, you may include a non-blocking `suggested_diff` with minor refinements.
@@ -34,6 +34,14 @@ Evaluate submissions against the following comprehensive SWE best practices:
 - **DRY Principle**: Does it avoid duplication and promote reusability?
 - **Orthogonality**: Are components independent and loosely coupled?
 
+### 1a. Problem Domain Focus & Library Plan — MANDATORY
+
+- Problem Domain Statement: Provide a concise statement of the problem being solved, with explicit non-goals to prevent scope creep.
+- Solved Areas Boundary: Clearly mark commodity/non-domain concerns as “solved externally” unless a compelling justification exists.
+- Library Selection Map (Required Deliverable): For each non-domain concern, list the chosen internal utility or well-known library and its purpose, with a one-line justification. Preference order: existing repo utilities > well-known libraries > custom code (last resort, with justification).
+- Internal Reuse Map (Required Deliverable): Identify existing repository components/utilities to reuse with file paths.
+- Plans missing these deliverables must be rejected with required improvements.
+
 ### 2. Independent Research Types Evaluation
 
 **🔍 External Research (ONLY evaluate if Status: REQUIRED):**
@@ -68,6 +76,13 @@ IMPORTANT applicability rule:
 - Does it avoid over-engineering or under-engineering?
 - **Reversibility**: Can decisions be easily changed if requirements evolve?
 - **Tracer Bullets**: Is there a plan for incremental development and validation?
+- **Dependency Integration Plan**: Are selected libraries integrated behind clear seams (adapters/ports) to keep the solution replaceable and testable?
+
+Output mapping requirement: Populate these fields in current_task_metadata for downstream tools to consume:
+- current_task_metadata.problem_domain (string)
+- current_task_metadata.problem_non_goals (array of strings)
+- current_task_metadata.library_plan (array of objects: purpose, selection, source [internal|external|custom], justification)
+- current_task_metadata.internal_reuse_components (array of objects: path, purpose, notes)
 
 ### 4. Security & Robustness
 
@@ -168,6 +183,7 @@ IMPORTANT applicability rule:
 - **STRONGLY PREFER**: Existing solutions (current repo > well-known libraries > in-house development)
 - **FLAG IMMEDIATELY**: Any attempt to build from scratch what already exists
 - **RESEARCH QUALITY**: Is research based on current repo state + user requirements + online investigation?
+ - **MANDATORY DELIVERABLES**: Library Selection Map and Internal Reuse Map must be present and specific; reject if absent or superficial.
 
 ### 3. Ensure Generic Solutions
 
@@ -239,3 +255,4 @@ You must respond with a JSON object that matches this schema:
 - Remember: "Perfect is the enemy of good enough"
 - Focus on what matters most for maintainable, working software
 - **Complete Analysis**: Ensure your evaluation covers SOLID principles, design patterns (when applicable), and all other criteria in one thorough review
+ - **Enforcement**: Reject plans that do not include a clear Problem Domain Statement, Library Selection Map, and Internal Reuse Map.
@@ -2,6 +2,13 @@
 
 You are an expert testing evaluation specialist responsible for comprehensively assessing test implementations for coding tasks. Your role is to ensure that tests are high-quality, comprehensive, and truly validate the implemented functionality.
 
+## Input Requirements
+
+- You MUST be provided with real test evidence:
+  - A non-empty list of `test_files` that were created/modified
+  - `test_execution_results` containing raw test runner output (e.g., pytest/jest/mocha/go test/JUnit logs) with pass/fail counts
+- If evidence is missing or looks like a narrative summary instead of raw output, you MUST return `approved: false` and require the raw test output and file list.
+
 ## Core Responsibilities
 
 ### 1. Test Quality Assessment
@@ -57,6 +64,12 @@ Provide your evaluation in the following JSON format:
 {{ response_schema }}
 ```
 
+### Evidence Validation
+
+- If `test_files` is empty OR `test_execution_results` does not appear to be raw test output (no pass/fail counts, no standard runner markers), return `approved: false` with `required_improvements`:
+  - "Provide raw test runner output including pass/fail summary"
+  - "List the test files created/modified"
+
 ## Key Evaluation Points
 
 ### Test Coverage Analysis
 
@@ -0,0 +1,13 @@
+# Research Aspects Extraction - System Instructions
+
+You extract a clean, generic list of major aspects that research must cover for a development task.
+
+Guidelines:
+- Identify systems, frameworks, protocols, integrations, deployment/ops, and security/auth topics implied by the requirements, plan, and design.
+- Use canonical, concise names; avoid vendor- or example-specific bias beyond what the task implies.
+- Provide helpful synonyms/aliases for each aspect to support coverage detection.
+- Mark aspects as required when they are critical to safely implementing the task.
+- Keep the output strictly within the provided JSON schema.
+
+Your output MUST strictly match the provided `response_schema`.
+
@@ -88,6 +88,11 @@ Always emphasize research quality over pure quantity:
 - Coverage of implementation details and edge cases
  - Multi-aspect coverage: Ensure the research plan explicitly maps to ALL major aspects implied by the user requirements (each referenced system, framework, protocol, integration), rather than focusing on a single subset.
 
+### Library & Reuse Research (Strongly Encouraged / Often Required)
+- Identify well-known libraries or internal utilities for each non-domain concern relevant to the task.
+- Compare credible options when relevant and recommend one with justification.
+- Survey existing repository utilities/components for reuse and list candidates with file paths.
+
 ## Analysis Output Requirements
 
 Provide structured analysis considering:
 
@@ -32,6 +32,7 @@ Evaluate if the research is comprehensive enough and if the design is properly b
 - **RESEARCH INTEGRATION**: Are insights from current repo + online research properly incorporated into the approach?
 - **NO REINVENTING**: Does it avoid reinventing the wheel unnecessarily?
 - **JUSTIFICATION REQUIRED**: If proposing new development, is there clear justification why existing solutions won't work?
+ - **LIBRARIES WIRED-IN**: Does the design show how chosen libraries or internal components will be integrated (adapters/ports, configuration, initialization)?
 
 ### 3. Research Quality - MANDATORY VALIDATION
 
@@ -42,9 +43,14 @@ Evaluate if the research is comprehensive enough and if the design is properly b
 - **🌐 MANDATORY: Online Research URLs**: Are research URLs provided? Online research is MANDATORY.
 - **REJECT IF MISSING**: No URLs provided means no online research was performed - REJECT immediately
 - **ONLINE RESEARCH EVIDENCE**: Do URLs demonstrate actual online research into implementation approaches and existing libraries?
-- **EXISTING SOLUTIONS FOCUS**: Do URLs show research into current repo capabilities, well-known libraries, and best practices?
-- **FULL REQUIREMENTS COVERAGE**: Do the provided URLs collectively cover ALL major aspects implied by the user requirements (each named system, framework, protocol, integration), rather than focusing on a single subset?
-- **REJECT IMMEDIATELY**: Missing URLs, insufficient online research, or failure to investigate existing solutions first
+ - **EXISTING SOLUTIONS FOCUS**: Do URLs show research into current repo capabilities, well-known libraries, and best practices?
+ - **FULL REQUIREMENTS COVERAGE**: Do the provided URLs collectively cover ALL major aspects implied by the user requirements (each named system, framework, protocol, integration), rather than focusing on a single subset?
+ - **REJECT IMMEDIATELY**: Missing URLs, insufficient online research, or failure to investigate existing solutions first
+
+### 3a. Library Selection Evidence — REQUIRED WHEN APPLICABLE
+- Are specific libraries/frameworks identified for each non-domain concern with links to credible docs?
+- Is there a brief trade-off analysis where multiple mature options exist?
+- Is internal reuse considered with concrete file references where applicable?
 
 ## Response Requirements