Skip to content

Commit 39034ca

Browse files
feat: close architecture cockpit implementation gaps across extractors, pipeline, and MCP tools
Multi-language support: - Add test extraction for Java (JUnit), Go, PHP (PHPUnit), Ruby (RSpec/Minitest), C# (NUnit/xUnit) - Add signal collection (symbol_hints, call_sites, assertions) for PHP, Ruby, C# tests - Add AST-based UI route extraction for Java Spring, Python Flask/FastAPI, Go HTTP, PHP Laravel/Symfony, Ruby Rails, C# ASP.NET - Restore hybrid AST+LLM rule extraction: tree-sitter splits code units, LLM analyzes each concurrently via asyncio.gather, container names tracked Schema extraction: - Add deterministic parsers for Django models, SQLAlchemy models, Prisma schema - LLM fallback preserved for unsupported ORM formats Knowledge graph fixes: - Fix import edges silently dropped: resolve IMPORTS to file-level edges via batch symbol-to-file lookup (eliminates N+1 queries) - Fix orphan SYMBOL cleanup scoped to source URIs (was deleting other sources' nodes) - Fix _map_search_to_nodes natural_key mismatch: FILE nodes use raw URIs, not file: prefixed keys — graph_rag queries now find local context - Fix entity extraction: use pre-split Chunk table instead of whole documents, removing silent 8000-char truncation that discarded most of large files Architecture cockpit: - Make arc42 sections data-driven: sections 1,2,4,8,9 now pull real extracted data instead of static templates; section 10 computes quality from fact confidence when quality_summary unavailable - Enable arc42 generation on sync by default (arch_docs_generate_on_sync=true) - Fix partial sync skipping GraphRAG chain: steps 5-7 now check if semantic entities exist before skipping, preventing permanently empty graph on re-syncs Code quality: - Extract shared helpers to ast_utils.py: find_enclosing_class_name, ruby_first_string_arg, java_annotation_names, csharp_attribute_names - Fix flows.py duplicate symbol collection bug - Add GraphRAG truncation logging and failure diagnostics - Make MCP error messages actionable: research_validation, research_data_model, research_architecture, graph_rag, 
get_arc42, arc42_drift_report now explain possible causes and remediation steps instead of bare "not found"
1 parent 4c66a34 commit 39034ca

File tree

17 files changed

+2490
-168
lines changed

17 files changed

+2490
-168
lines changed

apps/api/app/mcp_server.py

Lines changed: 44 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1325,7 +1325,15 @@ async def research_validation(
13251325
matched_candidates = _match_candidates_by_query(all_candidates, query_words)
13261326

13271327
if not matched_rules and not matched_candidates:
1328-
return f"# No Validation Rules Found\n\nNo business rules or validation logic found for: `{code_path}`"
1328+
return (
1329+
f"# No Validation Rules Found\n\n"
1330+
f"No business rules or validation logic found for: `{code_path}`\n\n"
1331+
"**Possible causes:**\n"
1332+
"- The source hasn't been synced yet — trigger a sync to run extraction\n"
1333+
"- No LLM provider is configured (`DEFAULT_LLM_PROVIDER` env var) — "
1334+
"business rule extraction requires an LLM\n"
1335+
"- The code genuinely has no validation patterns for this path"
1336+
)
13291337

13301338
return _format_validation_results(code_path, matched_rules, matched_candidates)
13311339

@@ -1481,7 +1489,14 @@ async def research_data_model(
14811489
erd_artifact = result.scalar_one_or_none()
14821490

14831491
if not tables and not columns and not endpoints:
1484-
return f"# No Data Model Found\n\nNo tables, columns, or APIs found for: `{entity}`"
1492+
return (
1493+
f"# No Data Model Found\n\n"
1494+
f"No tables, columns, or APIs found for: `{entity}`\n\n"
1495+
"**Possible causes:**\n"
1496+
"- The source hasn't been synced yet — trigger a sync to run schema extraction\n"
1497+
"- No SQL/DDL, Django, SQLAlchemy, Prisma, or migration files were found\n"
1498+
"- Try a broader search term (e.g., a table name or entity name)"
1499+
)
14851500

14861501
return _format_data_model_results(
14871502
entity, entity_lower, tables, columns, endpoints, erd_artifact
@@ -1746,7 +1761,11 @@ async def research_architecture(
17461761
if len(lines) == 1:
17471762
lines.append(f"No specific architecture information found for topic: `{topic}`\n")
17481763
lines.append(
1749-
"Try topics like: api, deployment, database, security, ui, tests, flows"
1764+
"Try topics like: api, deployment, database, security, ui, tests, flows\n"
1765+
)
1766+
lines.append(
1767+
"**If all topics are empty**, the knowledge graph may not be populated yet. "
1768+
"Ensure the source has been synced and extraction completed successfully."
17501769
)
17511770

17521771
return "\n".join(lines)
@@ -1988,7 +2007,15 @@ async def _graph_rag_context(
19882007
or md
19892008
== f"# GraphRAG Context: {query}\n\nFound 0 communities, 0 entities, 0 citations.\n"
19902009
):
1991-
return f"# No Results\n\nNo relevant content found for: {query}"
2010+
return (
2011+
f"# No Results\n\n"
2012+
f"No relevant content found for: {query}\n\n"
2013+
"**Possible causes:**\n"
2014+
"- The knowledge graph has no semantic entities yet — ensure sync completed "
2015+
"with LLM extraction enabled\n"
2016+
"- Community summaries haven't been generated or embedded\n"
2017+
"- Try rephrasing the query with different terms"
2018+
)
19922019

19932020
if rebuild_mode:
19942021
md += (
@@ -2571,9 +2598,13 @@ async def mcp_get_arc42(
25712598

25722599
if not regenerate:
25732600
return (
2574-
"# Error\n\n"
2575-
"arc42 artifact not generated yet. Trigger explicit generation with "
2576-
"`regenerate=true`."
2601+
"# arc42 Not Generated Yet\n\n"
2602+
"The arc42 architecture document hasn't been generated for this collection.\n\n"
2603+
"**To generate it:**\n"
2604+
"1. Ensure the source has been synced (this builds the knowledge graph)\n"
2605+
"2. Call `get_arc42(regenerate=true)` to generate the document\n\n"
2606+
"If `arch_docs_generate_on_sync` is enabled (default), "
2607+
"arc42 is generated automatically during sync."
25772608
)
25782609

25792610
return await _arc42_regenerate(
@@ -2660,7 +2691,12 @@ async def _resolve_drift_baseline(
26602691
)
26612692
).scalar_one_or_none()
26622693
if baseline is None:
2663-
return None, "# Error\n\nBaseline scenario not found in collection."
2694+
return None, (
2695+
"# Error\n\n"
2696+
"Baseline scenario not found in collection.\n\n"
2697+
"Drift reports compare two scenarios. Ensure a baseline scenario exists "
2698+
"by running at least two syncs, or specify a valid `baseline_scenario_id`."
2699+
)
26642700
return baseline, None
26652701

26662702
baseline = None

apps/worker/contextmine_worker/flows.py

Lines changed: 55 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -945,8 +945,10 @@ async def _kg_extract_business_rules(
945945
from contextmine_core.treesitter.languages import detect_language
946946

947947
if changed_doc_ids is not None and len(changed_doc_ids) == 0:
948-
logger.info("No changed documents - skipping business rule extraction")
949-
return 0
948+
if await _kg_has_business_rules(collection_uuid):
949+
logger.info("No changed documents and business rules exist - skipping extraction")
950+
return 0
951+
logger.info("No changed documents but no business rules found - running initial extraction")
950952

951953
all_extractions = []
952954
async with get_session() as session:
@@ -1032,6 +1034,38 @@ async def _kg_extract_surfaces(source_uuid: object, collection_uuid: object) ->
10321034
return result_stats
10331035

10341036

1037+
async def _kg_has_semantic_entities(collection_uuid: object) -> bool:
1038+
"""Check if any SEMANTIC_ENTITY nodes exist for this collection."""
1039+
from contextmine_core.models import KnowledgeNode, KnowledgeNodeKind
1040+
1041+
async with get_session() as session:
1042+
result = await session.execute(
1043+
select(KnowledgeNode.id)
1044+
.where(
1045+
KnowledgeNode.collection_id == collection_uuid,
1046+
KnowledgeNode.kind == KnowledgeNodeKind.SEMANTIC_ENTITY,
1047+
)
1048+
.limit(1)
1049+
)
1050+
return result.scalar_one_or_none() is not None
1051+
1052+
1053+
async def _kg_has_business_rules(collection_uuid: object) -> bool:
1054+
"""Check if any BUSINESS_RULE nodes exist for this collection."""
1055+
from contextmine_core.models import KnowledgeNode, KnowledgeNodeKind
1056+
1057+
async with get_session() as session:
1058+
result = await session.execute(
1059+
select(KnowledgeNode.id)
1060+
.where(
1061+
KnowledgeNode.collection_id == collection_uuid,
1062+
KnowledgeNode.kind == KnowledgeNodeKind.BUSINESS_RULE,
1063+
)
1064+
.limit(1)
1065+
)
1066+
return result.scalar_one_or_none() is not None
1067+
1068+
10351069
async def _kg_step_semantic_entities(
10361070
stats: dict,
10371071
collection_uuid: object,
@@ -1047,8 +1081,15 @@ async def _kg_step_semantic_entities(
10471081
)
10481082

10491083
if changed_doc_ids is not None and len(changed_doc_ids) == 0:
1050-
logger.info("No changed documents - skipping semantic entity extraction")
1051-
return
1084+
# No docs changed — skip only if entities already exist from a prior run
1085+
if await _kg_has_semantic_entities(collection_uuid):
1086+
logger.info(
1087+
"No changed documents and semantic entities exist - skipping extraction"
1088+
)
1089+
return
1090+
logger.info(
1091+
"No changed documents but no semantic entities found - running initial extraction"
1092+
)
10521093
async with get_session() as session:
10531094
extraction_batch = await extract_from_documents(
10541095
collection_id=collection_uuid,
@@ -1112,8 +1153,16 @@ async def _kg_step_summaries(
11121153
from contextmine_core.knowledge.summaries import generate_community_summaries
11131154

11141155
if changed_doc_ids is not None and len(changed_doc_ids) == 0:
1115-
logger.info("No changed documents - skipping community summary regeneration")
1116-
return
1156+
# No docs changed — skip only if semantic entities (and thus summaries)
1157+
# already exist from a prior run
1158+
if await _kg_has_semantic_entities(collection_uuid):
1159+
logger.info(
1160+
"No changed documents and semantic entities exist - skipping summary regeneration"
1161+
)
1162+
return
1163+
logger.info(
1164+
"No changed documents but no semantic entities found - running initial summary generation"
1165+
)
11171166
async with get_session() as session:
11181167
summary_stats = await generate_community_summaries(
11191168
session,

apps/worker/tests/test_flows_final.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -360,10 +360,10 @@ async def mock_exec(stmt):
360360

361361

362362
class TestBuildKGSemanticCommunity:
363-
async def test_skip_semantic_when_no_changed_docs(
363+
async def test_skip_semantic_when_no_changed_docs_and_entities_exist(
364364
self, monkeypatch: pytest.MonkeyPatch
365365
) -> None:
366-
"""Line 935-936: Empty changed_doc_ids skips semantic extraction."""
366+
"""Empty changed_doc_ids skips semantic extraction when entities already exist."""
367367
source_id = str(uuid.uuid4())
368368
collection_id = str(uuid.uuid4())
369369

@@ -375,10 +375,13 @@ async def test_skip_semantic_when_no_changed_docs(
375375

376376
mock_session = AsyncMock()
377377

378+
# Return a non-None value for _kg_has_semantic_entities check
379+
existing_node_id = uuid.uuid4()
380+
378381
async def mock_exec(stmt):
379382
r = MagicMock()
380383
r.all.return_value = []
381-
r.scalar_one_or_none.return_value = None
384+
r.scalar_one_or_none.return_value = existing_node_id
382385
return r
383386

384387
mock_session.execute = mock_exec
@@ -417,6 +420,7 @@ async def mock_exec(stmt):
417420
collection_id=collection_id,
418421
changed_doc_ids=[],
419422
)
423+
# Skips extraction because semantic entities already exist
420424
extract_mock.assert_not_called()
421425

422426
async def test_semantic_extraction_error_caught(self, monkeypatch: pytest.MonkeyPatch) -> None:

packages/core/contextmine_core/analyzer/extractors/ast_utils.py

Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,92 @@ def is_pascal_case(value: str) -> bool:
7676
return all(ch.isalnum() or ch == "_" for ch in value)
7777

7878

79+
# ---------------------------------------------------------------------------
80+
# Cross-language AST helpers shared between extractors
81+
# ---------------------------------------------------------------------------
82+
83+
84+
def find_enclosing_class_name(
85+
content: str,
86+
node: Any,
87+
class_type: str = "class_declaration",
88+
name_fields: tuple[str, ...] = ("identifier",),
89+
) -> str | None:
90+
"""Walk up the AST to find the enclosing class name."""
91+
parent = node.parent
92+
while parent is not None:
93+
if parent.type == class_type:
94+
for field_name in name_fields:
95+
name_node = first_child(parent, field_name)
96+
if name_node:
97+
name = node_text(content, name_node).strip()
98+
if name:
99+
return name
100+
break
101+
parent = parent.parent
102+
return None
103+
104+
105+
def ruby_first_string_arg(content: str, call_node: Any) -> str | None:
106+
"""Extract the first string argument from a Ruby call node."""
107+
args = call_node.child_by_field_name("arguments")
108+
if args is None:
109+
for child in call_node.children:
110+
if child.type == "argument_list":
111+
args = child
112+
break
113+
if args is None:
114+
return None
115+
for child in args.children:
116+
if child.type in {"string", "string_literal"}:
117+
return unquote(node_text(content, child))
118+
return None
119+
120+
121+
def java_annotation_names(content: str, node: Any) -> list[str]:
122+
"""Extract annotation names from a Java method/class node's modifiers."""
123+
names: list[str] = []
124+
parent = node.parent
125+
if parent is None:
126+
return names
127+
for child in parent.children:
128+
if child.type == "modifiers":
129+
for mod in child.children:
130+
if mod.type in {"marker_annotation", "annotation"}:
131+
name_node = first_child(mod, "identifier")
132+
if name_node:
133+
names.append(node_text(content, name_node).strip().lower())
134+
return names
135+
136+
137+
def csharp_attribute_names(content: str, node: Any) -> set[str]:
138+
"""Extract attribute names from a C# node's preceding attribute_list siblings."""
139+
attrs: set[str] = set()
140+
parent = node.parent
141+
if parent is None:
142+
return attrs
143+
for child in parent.children:
144+
if child is node:
145+
break
146+
if child.type == "attribute_list":
147+
for attr in walk(child):
148+
if attr.type in {"identifier", "attribute"}:
149+
name = node_text(content, attr).strip().lower()
150+
if name.endswith("attribute"):
151+
name = name[: -len("attribute")]
152+
attrs.add(name)
153+
# Also check direct children
154+
for child in node.children:
155+
if child.type == "attribute_list":
156+
for attr in walk(child):
157+
if attr.type in {"identifier", "attribute"}:
158+
name = node_text(content, attr).strip().lower()
159+
if name.endswith("attribute"):
160+
name = name[: -len("attribute")]
161+
attrs.add(name)
162+
return attrs
163+
164+
79165
# ---------------------------------------------------------------------------
80166
# JS/TS AST helpers shared between the tests and UI extractors
81167
# ---------------------------------------------------------------------------

packages/core/contextmine_core/analyzer/extractors/flows.py

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -142,12 +142,6 @@ def synthesize_user_flows(
142142
route_to_symbol_hints, route_to_navigation_hints = _collect_route_hints_from_ui(ui_extractions)
143143
symbol_to_test_refs = _collect_symbol_test_refs(test_extractions)
144144

145-
for test_file in test_extractions:
146-
for case in test_file.cases:
147-
for symbol_hint in case.symbol_hints:
148-
symbol_to_test_refs.setdefault(symbol_hint.lower(), [])
149-
symbol_to_test_refs[symbol_hint.lower()].append(case.natural_key)
150-
151145
synthesis = FlowSynthesis()
152146
for route, symbol_hints in sorted(route_to_symbol_hints.items()):
153147
deduped_hints = list(

0 commit comments

Comments (0)