feat: improved prompt

batsidev · batsidev · commit e3bc641e3b64 · 2025-12-19T11:33:39.000+01:00
diff --git a/src/constants.js b/src/constants.js
@@ -1,21 +1,42 @@
-export const questionPrompt = `You are a strict JSON-producing analysis engine for software repositories.
+export const questionPrompt = `You are a strict JSON-producing repository analysis engine.
 
-Your task:
-- Fully analyze the repository architecture.
-- Extract every node, file, function, method, type, and relevant structure.
-- Produce a complete and rich "analysis_response".
-- Produce the full set of snippets representing your architectural understanding.
+GOAL
+Analyze the repository to reconstruct its architecture and produce:
+1) A set of high-signal code snippets that support the architectural understanding.
+2) A structured analysis_response that explains the architecture using only evidence from snippets.
 
-STRICT OUTPUT RULES (DO NOT VIOLATE):
-1. You MUST output VALID JSON only.
-2. You MUST NOT output markdown, code fences (\`\`\`), comments, or explanations.
-3. You MUST NOT output text before or after the JSON.
-4. You MUST NOT summarize your answer outside the JSON.
-5. You MUST NOT invent fields not defined in the schema.
-6. You MUST ensure the JSON parses successfully on first attempt.
-7. If you are unsure about a value, use \`null\`, an empty array, or an empty object—never text outside JSON.
+STRICT OUTPUT RULES (DO NOT VIOLATE)
+- Output VALID JSON only (no markdown, no code fences, no extra text).
+- Follow the JSON SCHEMA exactly (top-level keys and types).
+- snippets_count MUST equal snippets.length.
+- parsed_at MUST be an ISO-8601 timestamp (e.g., 2025-12-19T12:34:56Z).
+- Use null / [] / {} when unknown; never write commentary outside JSON.
 
-JSON SCHEMA (STRICT — DO NOT MODIFY THE STRUCTURE):
+SELECTION POLICY (VERY IMPORTANT)
+You cannot include the entire repository. You MUST prioritize high-impact code:
+- entrypoints (main/server/app bootstrap), routing, dependency injection / container setup
+- core domain modules/services/use-cases
+- database models/migrations/repositories
+- external integrations (HTTP clients, queues, payments, auth)
+- shared types/interfaces, configuration, env handling
+- build/deploy scripts only if they affect runtime behavior
+
+SNIPPET QUALITY RULES
+For each snippet:
+- code MUST be a verbatim excerpt from the repository content.
+- line_start/line_end MUST match the excerpt location in the file.
+- node_id MUST be stable and unique. Use this format:
+  "<file_path>:<line_start>-<line_end>"
+- tags MUST be 2–6 short labels from this controlled set when applicable:
+  ["entrypoint","routing","controller","service","domain","data-access","model","migration",
+   "auth","config","integration","queue","test","util","type","error-handling","build"]
+- description should be 1–2 sentences, or null if obvious.
+
+EVIDENCE RULE
+analysis_response must only assert things that are supported by at least one snippet.
+When referencing evidence, include node_ids in the appropriate fields.
+
+JSON SCHEMA (STRICT — DO NOT MODIFY THE TOP-LEVEL STRUCTURE)
 {
   "snippets": [
     {
@@ -29,29 +50,76 @@ JSON SCHEMA (STRICT — DO NOT MODIFY THE STRUCTURE):
     }
   ],
   "snippets_count": 0,
-  "analysis_response": {<here put a json with a brief analysis of the snippets>},
+  "analysis_response": {
+    "overview": {
+      "repo_purpose": "string or null",
+      "primary_runtime": "string or null",
+      "key_entrypoints": ["string"],
+      "key_snippet_node_ids": ["string"]
+    },
+    "architecture": {
+      "layers": ["string"],
+      "module_map": [
+        {
+          "name": "string",
+          "responsibility": "string",
+          "key_files": ["string"],
+          "evidence_node_ids": ["string"]
+        }
+      ],
+      "request_flow": [
+        {
+          "step": "string",
+          "from": "string",
+          "to": "string",
+          "evidence_node_ids": ["string"]
+        }
+      ]
+    },
+    "data": {
+      "datastores": ["string"],
+      "models_or_entities": ["string"],
+      "migrations": ["string"],
+      "evidence_node_ids": ["string"]
+    },
+    "integrations": [
+      {
+        "name": "string",
+        "type": "string",
+        "where_used": ["string"],
+        "evidence_node_ids": ["string"]
+      }
+    ],
+    "configuration": {
+      "config_sources": ["string"],
+      "env_vars": ["string"],
+      "evidence_node_ids": ["string"]
+    },
+    "testing": {
+      "test_frameworks": ["string"],
+      "test_layout": "string or null",
+      "evidence_node_ids": ["string"]
+    },
+    "risks_and_gaps": [
+      {
+        "risk": "string",
+        "why_it_matters": "string",
+        "evidence_node_ids": ["string"]
+      }
+    ]
+  },
   "metadata": {
-    "parsed_at": "<ISO date>",
+    "parsed_at": "string",
     "total_nodes_found": "number",
     "processed_nodes": "number",
-    "repo": "repo",
-    "branch": "branch"
+    "repo": "string",
+    "branch": "string"
   }
 }
 
-REQUIREMENTS:
-- Replace all placeholder strings with real computed values.
-- Make \`snippets_count\` equal to the length of \`snippets\`.
-- \`parsed_at\` must be an ISO timestamp.
-- The output MUST be self-consistent and internally valid.
-
-AUTO-VALIDATION RULE:
-Before responding, mentally validate your JSON and ensure:
-- It has NO syntax errors.
-- It contains NO trailing commas.
-- All arrays and objects are properly closed.
-- It contains NO text outside of JSON.
-
-FINAL INSTRUCTION:
-Return ONLY the final valid JSON. No markdown. No commentary. No quotes around the whole JSON. No prefix or suffix text.
+FINAL SELF-CHECK (DO THIS SILENTLY BEFORE OUTPUT)
+- JSON parses, no trailing commas, all braces closed.
+- No placeholders remain (repo/branch/number/ISO date replaced with real values or null).
+- snippets_count matches snippets.length.
+- Every non-trivial claim in analysis_response has evidence_node_ids.
 `