def _bulk_copy(conn: ladybug.Connection, table_name: str, columns: list[str], rows: list[dict]) -> None:
    """Bulk-load ``rows`` into a node/rel table via an in-memory pyarrow ``COPY FROM``.

    Args:
        conn: Open connection the ``COPY`` statement is executed on.
        table_name: Target node or rel table. It is interpolated into the
            statement text, so it must be a trusted internal identifier.
        columns: Column names in the order the target table declares them.
            For REL tables the first two MUST be the ``FROM``/``TO`` endpoint
            primary keys (kuzu requirement). Every row must supply each name.
        rows: One dict per record, keyed by column name. Empty is a no-op.

    Spike result (PR-P1 step-1): REL ``COPY FROM`` expects columns named
    ``FROM`` and ``TO`` for the endpoint node IDs, followed by the property
    columns in declared order; value types are inferred from the dict values.
    """
    if not rows:
        return
    # from_pylist() lays columns out in dict-key order, which only matches the
    # table declaration if every caller builds its dicts in exactly that order.
    # Projecting through `columns` makes the declared order authoritative and
    # fails fast (pyarrow raises) when a staged row set lacks a declared column.
    tbl = pa.Table.from_pylist(rows).select(columns)
    conn.execute(f"COPY {table_name} FROM $rows", {"rows": tbl})


def _existing_symbol_ids(conn: ladybug.Connection) -> set[str]:
    """Return the id of every Symbol node currently in the graph.

    Bulk ``COPY FROM`` enforces referential integrity: a REL row whose FROM/TO
    endpoint isn't a loaded node raises ``Unable to find primary key value``.
    The legacy per-row statements had the shape
    ``MATCH (a:Symbol {id:$src}), (b:Symbol {id:$dst}) CREATE (a)-[...]->(b)``,
    so an edge with a missing endpoint matched nothing and was silently
    dropped. ``_write_edges`` filters its edge rows against this set to
    reproduce that behaviour exactly.

    The live DB is queried (rather than deriving ids from ``tables``) because
    ``_write_edges`` is shared with the incremental path, whose edges
    legitimately reference nodes written in prior runs. Both paths call
    ``_write_nodes`` before ``_write_edges``, so freshly written nodes are
    included.
    """
    result = conn.execute("MATCH (n:Symbol) RETURN n.id")
    ids: set[str] = set()
    while result.has_next():
        ids.add(result.get_next()[0])
    return ids


# Column-order constants for bulk COPY FROM.
# For REL tables, the first two entries are FROM/TO node primary keys (kuzu requirement).
# Order matches the corresponding _SCHEMA_* declarations above.
_NODE_COLUMNS = [
    "id", "kind", "name", "fqn", "package", "module", "microservice",
    "filename", "start_line", "end_line", "start_byte", "end_byte",
    "modifiers", "annotations", "capabilities", "role", "signature", "parent_id", "resolved",
]

_REL_EXTENDS_COLUMNS = ["FROM", "TO", "source_file", "dst_name", "dst_fqn", "resolved"]
_REL_IMPLEMENTS_COLUMNS = ["FROM", "TO", "source_file", "dst_name", "dst_fqn", "resolved"]
_REL_INJECTS_COLUMNS = [
    "FROM", "TO", "source_file", "dst_name", "dst_fqn", "resolved",
    "mechanism", "annotation", "field_or_param",
]
_REL_DECLARES_COLUMNS = ["FROM", "TO", "source_file"]
_REL_OVERRIDES_COLUMNS = ["FROM", "TO", "source_file"]
_REL_CALLS_COLUMNS = [
    "FROM", "TO", "source_file", "call_site_line", "call_site_byte", "arg_count",
    "confidence", "strategy", "source", "resolved", "callee_declaring_role",
]

_UNRESOLVED_CALL_SITE_COLUMNS = [
    "id", "caller_id", "call_site_line", "call_site_byte", "arg_count",
    "callee_simple", "receiver_expr", "reason",
]
_REL_UNRESOLVED_AT_COLUMNS = ["FROM", "TO", "source_file"]
- "CREATE (a)-[:IMPLEMENTS {source_file: $source_file, dst_name: $dst_name, dst_fqn: $dst_fqn, resolved: $resolved}]->(b)" -) -_CREATE_INJ = ( - "MATCH (a:Symbol {id: $src}), (b:Symbol {id: $dst}) " - "CREATE (a)-[:INJECTS {source_file: $source_file, dst_name: $dst_name, dst_fqn: $dst_fqn, resolved: $resolved, " - "mechanism: $mechanism, annotation: $annotation, field_or_param: $field_or_param}]->(b)" -) -_CREATE_DECL = ( - "MATCH (a:Symbol {id: $src}), (b:Symbol {id: $dst}) " - "CREATE (a)-[:DECLARES {source_file: $source_file}]->(b)" -) -_CREATE_OVERRIDES = ( - "MATCH (a:Symbol {id: $src}), (b:Symbol {id: $dst}) " - "CREATE (a)-[:OVERRIDES {source_file: $source_file}]->(b)" -) -_CREATE_CALL = ( - "MATCH (a:Symbol {id: $src}), (b:Symbol {id: $dst}) " - "CREATE (a)-[:CALLS {" - "source_file: $source_file, " - "call_site_line: $line, call_site_byte: $byte, arg_count: $argc, " - "confidence: $conf, strategy: $strat, source: $src_kind, resolved: $resolved, " - "callee_declaring_role: $callee_declaring_role" - "}]->(b)" -) - _CREATE_ROUTE = ( "CREATE (:Route {" "id: $id, kind: $kind, framework: $framework, method: $method, " @@ -3246,93 +3273,126 @@ def _write_edges(conn: ladybug.Connection, tables: GraphTables, _file_by_node_id if _file_by_node_id is None: _file_by_node_id = _build_file_by_node_id(tables) - for r in tables.extends_rows: - conn.execute(_CREATE_EXT, { - "src": r.src_id, "dst": r.dst_id, + # Bulk COPY FROM enforces referential integrity — a REL row whose endpoint + # node isn't loaded raises "Unable to find primary key value". The legacy + # per-row MERGE silently skipped such edges; drop them here to preserve the + # per-row graph exactly. _existing_symbol_ids reads the live DB (not just + # `tables`) so the incremental path's references to prior-run nodes still hold. 
+ valid_ids = _existing_symbol_ids(conn) + + # Stage EXTENDS rows + extends_rows = [ + { + "FROM": r.src_id, "TO": r.dst_id, "source_file": _file_by_node_id.get(r.src_id, ""), "dst_name": r.dst_name, "dst_fqn": r.dst_fqn, "resolved": r.resolved, - }) - for r in tables.implements_rows: - conn.execute(_CREATE_IMPL, { - "src": r.src_id, "dst": r.dst_id, + } + for r in tables.extends_rows + if r.src_id in valid_ids and r.dst_id in valid_ids + ] + _bulk_copy(conn, "EXTENDS", _REL_EXTENDS_COLUMNS, extends_rows) + + # Stage IMPLEMENTS rows + implements_rows = [ + { + "FROM": r.src_id, "TO": r.dst_id, "source_file": _file_by_node_id.get(r.src_id, ""), "dst_name": r.dst_name, "dst_fqn": r.dst_fqn, "resolved": r.resolved, - }) - for r in tables.injects_rows: - conn.execute(_CREATE_INJ, { - "src": r.src_id, "dst": r.dst_id, + } + for r in tables.implements_rows + if r.src_id in valid_ids and r.dst_id in valid_ids + ] + _bulk_copy(conn, "IMPLEMENTS", _REL_IMPLEMENTS_COLUMNS, implements_rows) + + # Stage INJECTS rows + injects_rows = [ + { + "FROM": r.src_id, "TO": r.dst_id, "source_file": _file_by_node_id.get(r.src_id, ""), "dst_name": r.dst_name, "dst_fqn": r.dst_fqn, "resolved": r.resolved, "mechanism": r.mechanism, "annotation": r.annotation, "field_or_param": r.field_or_param, - }) + } + for r in tables.injects_rows + if r.src_id in valid_ids and r.dst_id in valid_ids + ] + _bulk_copy(conn, "INJECTS", _REL_INJECTS_COLUMNS, injects_rows) - for row in tables.declares_rows: - conn.execute(_CREATE_DECL, { - "src": row.src_id, "dst": row.dst_id, + # Stage DECLARES rows + declares_rows = [ + { + "FROM": row.src_id, "TO": row.dst_id, "source_file": _file_by_node_id.get(row.src_id, ""), - }) + } + for row in tables.declares_rows + if row.src_id in valid_ids and row.dst_id in valid_ids + ] + _bulk_copy(conn, "DECLARES", _REL_DECLARES_COLUMNS, declares_rows) - for row in tables.overrides_rows: - conn.execute(_CREATE_OVERRIDES, { - "src": row.src_id, "dst": row.dst_id, + # Stage 
OVERRIDES rows + overrides_rows = [ + { + "FROM": row.src_id, "TO": row.dst_id, "source_file": _file_by_node_id.get(row.src_id, ""), - }) + } + for row in tables.overrides_rows + if row.src_id in valid_ids and row.dst_id in valid_ids + ] + _bulk_copy(conn, "OVERRIDES", _REL_OVERRIDES_COLUMNS, overrides_rows) + # Stage CALLS rows with dedup and callee_declaring_role materialization seen_calls: set[tuple[str, str, int, int]] = set() - unique_calls: list[CallsRow] = [] + calls_rows: list[dict] = [] + member_by_id = {m.node_id: m for m in tables.members} for row in tables.calls_rows: + if row.src_id not in valid_ids or row.dst_id not in valid_ids: + continue key = (row.src_id, row.dst_id, row.arg_count, row.call_site_line) - if key not in seen_calls: - seen_calls.add(key) - unique_calls.append(row) - - member_by_id = {m.node_id: m for m in tables.members} - for row in unique_calls: - conn.execute(_CREATE_CALL, { - "src": row.src_id, "dst": row.dst_id, + if key in seen_calls: + continue + seen_calls.add(key) + calls_rows.append({ + "FROM": row.src_id, "TO": row.dst_id, "source_file": _file_by_node_id.get(row.src_id, ""), - "line": row.call_site_line, - "byte": row.call_site_byte, - "argc": row.arg_count, - "conf": row.confidence, - "strat": row.strategy, - "src_kind": row.source, - "resolved": row.resolved, + "call_site_line": row.call_site_line, "call_site_byte": row.call_site_byte, + "arg_count": row.arg_count, "confidence": row.confidence, "strategy": row.strategy, + "source": row.source, "resolved": row.resolved, "callee_declaring_role": _callee_declaring_role_at_write( tables, row.dst_id, member_by_id=member_by_id, ), }) + _bulk_copy(conn, "CALLS", _REL_CALLS_COLUMNS, calls_rows) - _CREATE_UNRESOLVED = ( - "CREATE (:UnresolvedCallSite {" - "id: $id, caller_id: $caller_id, call_site_line: $line, call_site_byte: $byte, " - "arg_count: $argc, callee_simple: $callee, receiver_expr: $recv, reason: $reason" - "})" - ) - _CREATE_UNRESOLVED_AT = ( - "MATCH (a:Symbol {id: 
$caller}), (u:UnresolvedCallSite {id: $ucs}) " - "CREATE (a)-[:UNRESOLVED_AT {source_file: $source_file}]->(u)" - ) + # Stage UnresolvedCallSite node rows (must load before UNRESOLVED_AT edges) seen_ucs: set[str] = set() + ucs_rows: list[dict] = [] for row in tables.unresolved_call_site_rows: if row.id in seen_ucs: continue seen_ucs.add(row.id) - conn.execute(_CREATE_UNRESOLVED, { + ucs_rows.append({ "id": row.id, "caller_id": row.caller_id, - "line": row.call_site_line, - "byte": row.call_site_byte, - "argc": row.arg_count, - "callee": row.callee_simple, - "recv": row.receiver_expr, + "call_site_line": row.call_site_line, + "call_site_byte": row.call_site_byte, + "arg_count": row.arg_count, + "callee_simple": row.callee_simple, + "receiver_expr": row.receiver_expr, "reason": row.reason, }) - conn.execute(_CREATE_UNRESOLVED_AT, { - "caller": row.caller_id, "ucs": row.id, - "source_file": _file_by_node_id.get(row.caller_id, ""), - }) + _bulk_copy(conn, "UnresolvedCallSite", _UNRESOLVED_CALL_SITE_COLUMNS, ucs_rows) + + # Stage UNRESOLVED_AT edge rows (one per unique UnresolvedCallSite node) + # Use the same ucs_rows list to ensure 1:1 correspondence + unresolved_at_rows = [ + { + "FROM": ucs_row["caller_id"], "TO": ucs_row["id"], + "source_file": _file_by_node_id.get(ucs_row["caller_id"], ""), + } + for ucs_row in ucs_rows + if ucs_row["caller_id"] in valid_ids + ] + _bulk_copy(conn, "UNRESOLVED_AT", _REL_UNRESOLVED_AT_COLUMNS, unresolved_at_rows) def _write_routes_and_exposes(conn: ladybug.Connection, tables: GraphTables, _file_by_node_id: dict[str, str] | None = None) -> None: diff --git a/tests/fixtures/graph_baseline_bank_chat.json b/tests/fixtures/graph_baseline_bank_chat.json new file mode 100644 index 0000000..a42224a --- /dev/null +++ b/tests/fixtures/graph_baseline_bank_chat.json @@ -0,0 +1,514 @@ +{ + "node_count": 959, + "edge_counts": { + "EXTENDS": 18, + "IMPLEMENTS": 21, + "INJECTS": 94, + "DECLARES": 606, + "OVERRIDES": 38, + "CALLS": 678, + 
"UNRESOLVED_AT": 227 + }, + "graph_meta": { + "ontology_version": 17, + "built_at": 1782110216, + "source_root": "/Users/dmitry/Desktop/CursorProjects/java-enterprise-codebase-rag/tests/bank-chat-system", + "counts_json": "{packages: 29, files: 130, types: 140, members: 606, phantoms: 54, extends: 18, implements: 21, injects: 94, declares: 606, overrides: 38, calls: 678, routes: 29, exposes: 15, clients: 8, declares_client: 8, producers: 9, declares_producer: 9, http_calls: 8, async_calls: 9}" + }, + "sampled_edges": { + "EXTENDS": [ + [ + "ad9f9b25470043e863941d7623f4281fd1643934", + "f479db6c7da16352b33070ec9b13ee11e88176f0", + { + "_SRC": { + "offset": 190, + "table": 0 + }, + "_DST": { + "offset": 909, + "table": 0 + }, + "_LABEL": "EXTENDS", + "_ID": { + "offset": 0, + "table": 6 + }, + "source_file": "chat-assign/src/main/java/com/bank/chat/assign/repo/AssignChatRepository.java", + "dst_name": "JpaRepository", + "dst_fqn": "org.springframework.data.jpa.repository.JpaRepository", + "resolved": false + } + ], + [ + "8e04e698d01026f86d860ac8e5edf808dfdffdb9", + "f479db6c7da16352b33070ec9b13ee11e88176f0", + { + "_SRC": { + "offset": 191, + "table": 0 + }, + "_DST": { + "offset": 909, + "table": 0 + }, + "_LABEL": "EXTENDS", + "_ID": { + "offset": 1, + "table": 6 + }, + "source_file": "chat-assign/src/main/java/com/bank/chat/assign/repo/AssignSplitRepository.java", + "dst_name": "JpaRepository", + "dst_fqn": "org.springframework.data.jpa.repository.JpaRepository", + "resolved": false + } + ], + [ + "2152d3eeccb9cc139bd2f41259ff193e30ebffc2", + "f479db6c7da16352b33070ec9b13ee11e88176f0", + { + "_SRC": { + "offset": 192, + "table": 0 + }, + "_DST": { + "offset": 909, + "table": 0 + }, + "_LABEL": "EXTENDS", + "_ID": { + "offset": 2, + "table": 6 + }, + "source_file": "chat-assign/src/main/java/com/bank/chat/assign/repo/AssignOperatorSessionRepository.java", + "dst_name": "JpaRepository", + "dst_fqn": "org.springframework.data.jpa.repository.JpaRepository", + 
"resolved": false + } + ] + ], + "IMPLEMENTS": [ + [ + "cabe9aec56202fdedae6ad64bde83a01449b9d84", + "3d3a9d206733cdc298529a59963e04b7ea6ec19d", + { + "_SRC": { + "offset": 187, + "table": 0 + }, + "_DST": { + "offset": 908, + "table": 0 + }, + "_LABEL": "IMPLEMENTS", + "_ID": { + "offset": 0, + "table": 8 + }, + "source_file": "chat-assign/src/main/java/com/bank/chat/assign/domain/AssignOperatorSplitEntity.java", + "dst_name": "Serializable", + "dst_fqn": "java.io.Serializable", + "resolved": false + } + ], + [ + "3137c0434a3514545193ca7a89f2ee05bfc83bc8", + "9d2db7ab6598890c122e081ba40fe0d28c6f6f0f", + { + "_SRC": { + "offset": 220, + "table": 0 + }, + "_DST": { + "offset": 222, + "table": 0 + }, + "_LABEL": "IMPLEMENTS", + "_ID": { + "offset": 1, + "table": 8 + }, + "source_file": "chat-core/chat-engine/src/main/java/com/bank/chat/engine/pipeline/ClientSegmentFilter.java", + "dst_name": "EventFilter", + "dst_fqn": "com.bank.chat.engine.pipeline.EventFilter", + "resolved": true + } + ], + [ + "a1aed4824d920460874f718d400b51b9e4e6d306", + "9d2db7ab6598890c122e081ba40fe0d28c6f6f0f", + { + "_SRC": { + "offset": 221, + "table": 0 + }, + "_DST": { + "offset": 222, + "table": 0 + }, + "_LABEL": "IMPLEMENTS", + "_ID": { + "offset": 2, + "table": 8 + }, + "source_file": "chat-core/chat-engine/src/main/java/com/bank/chat/engine/pipeline/EventTypeFilter.java", + "dst_name": "EventFilter", + "dst_fqn": "com.bank.chat.engine.pipeline.EventFilter", + "resolved": true + } + ] + ], + "INJECTS": [ + [ + "4d6ea83c3054e046135b09af4f84c24f62a0d704", + "a0af47515c78a6f764922496fc865c3ab7ba2f59", + { + "_SRC": { + "offset": 165, + "table": 0 + }, + "_DST": { + "offset": 182, + "table": 0 + }, + "_LABEL": "INJECTS", + "_ID": { + "offset": 0, + "table": 10 + }, + "source_file": "chat-assign/src/main/java/com/bank/chat/assign/web/ChatManagementController.java", + "dst_name": "ChatManagementService", + "dst_fqn": "com.bank.chat.assign.service.ChatManagementService", + "resolved": true, + 
"mechanism": "constructor", + "annotation": "", + "field_or_param": "chatManagementService" + } + ], + [ + "1ea660dcf51af165d14d919b274e176cf9a710e4", + "70b6c32695b7438d46a58b22d06d7921be13a21e", + { + "_SRC": { + "offset": 166, + "table": 0 + }, + "_DST": { + "offset": 181, + "table": 0 + }, + "_LABEL": "INJECTS", + "_ID": { + "offset": 1, + "table": 10 + }, + "source_file": "chat-assign/src/main/java/com/bank/chat/assign/web/OperatorManagementController.java", + "dst_name": "OperatorSessionService", + "dst_fqn": "com.bank.chat.assign.service.OperatorSessionService", + "resolved": true, + "mechanism": "constructor", + "annotation": "", + "field_or_param": "operatorSessionService" + } + ], + [ + "52a49d3785a9dc6cabf3e3f44f2b3c0ff5730400", + "78911dbd1d5d92491fe3c7db4f7c128c5dd27a56", + { + "_SRC": { + "offset": 167, + "table": 0 + }, + "_DST": { + "offset": 905, + "table": 0 + }, + "_LABEL": "INJECTS", + "_ID": { + "offset": 2, + "table": 10 + }, + "source_file": "chat-assign/src/main/java/com/bank/chat/assign/web/dto/OpenSessionResponse.java", + "dst_name": "UUID", + "dst_fqn": "java.util.UUID", + "resolved": false, + "mechanism": "constructor", + "annotation": "", + "field_or_param": "sessionId" + } + ] + ], + "DECLARES": [ + [ + "7f7f0196f0a30df9bdefe8b9110b11823a83dd43", + "3ec7518715fe2cb5adc53a39e63568565784ade6", + { + "_SRC": { + "offset": 159, + "table": 0 + }, + "_DST": { + "offset": 300, + "table": 0 + }, + "_LABEL": "DECLARES", + "_ID": { + "offset": 1, + "table": 12 + }, + "source_file": "chat-assign/src/main/java/com/bank/chat/assign/ChatAssignApplication.java" + } + ], + [ + "7f7f0196f0a30df9bdefe8b9110b11823a83dd43", + "1dae8ba1e800d8a1857bf827ce5d1551cff71f2a", + { + "_SRC": { + "offset": 159, + "table": 0 + }, + "_DST": { + "offset": 299, + "table": 0 + }, + "_LABEL": "DECLARES", + "_ID": { + "offset": 0, + "table": 12 + }, + "source_file": "chat-assign/src/main/java/com/bank/chat/assign/ChatAssignApplication.java" + } + ], + [ + 
"51ef46974d13f727df02f0676cf4e71e98fcca6", + "5cb62af6786770e9d1a092f7b617ade0fdefa837", + { + "_SRC": { + "offset": 160, + "table": 0 + }, + "_DST": { + "offset": 304, + "table": 0 + }, + "_LABEL": "DECLARES", + "_ID": { + "offset": 5, + "table": 12 + }, + "source_file": "chat-assign/src/main/java/com/bank/chat/assign/config/AssignProperties.java" + } + ] + ], + "OVERRIDES": [ + [ + "82fd431d700c06ee03a5363bb34318f161576245", + "e04896a3cde61f301de034b4ac99ea9300243ee9", + { + "_SRC": { + "offset": 550, + "table": 0 + }, + "_DST": { + "offset": 556, + "table": 0 + }, + "_LABEL": "OVERRIDES", + "_ID": { + "offset": 19, + "table": 14 + }, + "source_file": "chat-core/chat-engine/src/main/java/com/bank/chat/engine/pipeline/ClientSegmentFilter.java" + } + ], + [ + "dbdebee186edbebca462b814cf4915e42b4a7cc9", + "e04896a3cde61f301de034b4ac99ea9300243ee9", + { + "_SRC": { + "offset": 553, + "table": 0 + }, + "_DST": { + "offset": 556, + "table": 0 + }, + "_LABEL": "OVERRIDES", + "_ID": { + "offset": 27, + "table": 14 + }, + "source_file": "chat-core/chat-engine/src/main/java/com/bank/chat/engine/pipeline/EventTypeFilter.java" + } + ], + [ + "78a014fc9824b97e391f0500541628d1fa7a793f", + "10affb1d65bce6f92eab774e4d16a238f042de8", + { + "_SRC": { + "offset": 605, + "table": 0 + }, + "_DST": { + "offset": 612, + "table": 0 + }, + "_LABEL": "OVERRIDES", + "_ID": { + "offset": 14, + "table": 14 + }, + "source_file": "chat-core/chat-engine/src/main/java/com/bank/chat/engine/notification/EmailNotificationSender.java" + } + ] + ], + "CALLS": [ + [ + "1dae8ba1e800d8a1857bf827ce5d1551cff71f2a", + "fe8b772e63e5f8c95dce52fd32792cf832828a62", + { + "_SRC": { + "offset": 299, + "table": 0 + }, + "_DST": { + "offset": 914, + "table": 0 + }, + "_LABEL": "CALLS", + "_ID": { + "offset": 0, + "table": 16 + }, + "source_file": "chat-assign/src/main/java/com/bank/chat/assign/ChatAssignApplication.java", + "call_site_line": 12, + "call_site_byte": 336, + "arg_count": 2, + "confidence": 0.95, + 
"strategy": "import_map", + "source": "static", + "resolved": false, + "callee_declaring_role": "OTHER" + } + ], + [ + "a805ff9fd7051f13a0a17fd6e558b4309629d55b", + "edc937715937310a41ef39028ff36630e87f05b9", + { + "_SRC": { + "offset": 320, + "table": 0 + }, + "_DST": { + "offset": 915, + "table": 0 + }, + "_LABEL": "CALLS", + "_ID": { + "offset": 1, + "table": 16 + }, + "source_file": "chat-assign/src/main/java/com/bank/chat/assign/web/ChatManagementController.java", + "call_site_line": 21, + "call_site_byte": 738, + "arg_count": 0, + "confidence": 0.9, + "strategy": "implicit_super", + "source": "static", + "resolved": false, + "callee_declaring_role": "OTHER" + } + ], + [ + "20d6b583bb28a9b84cd289bc46c6615f9348604a", + "8214d23f9241f03a88e438d9833ac1b26a2fd648", + { + "_SRC": { + "offset": 321, + "table": 0 + }, + "_DST": { + "offset": 916, + "table": 0 + }, + "_LABEL": "CALLS", + "_ID": { + "offset": 3, + "table": 16 + }, + "source_file": "chat-assign/src/main/java/com/bank/chat/assign/web/ChatManagementController.java", + "call_site_line": 28, + "call_site_byte": 1056, + "arg_count": 0, + "confidence": 0.95, + "strategy": "import_map", + "source": "static", + "resolved": false, + "callee_declaring_role": "OTHER" + } + ] + ], + "UNRESOLVED_AT": [ + [ + "6fa62d6eb64b6fc96824fd597e230c3d07f53757", + "ucs:6fa62d6eb64b6fc96824fd597e230c3d07f53757:14:464", + { + "_SRC": { + "offset": 318, + "table": 0 + }, + "_DST": { + "offset": 0, + "table": 1 + }, + "_LABEL": "UNRESOLVED_AT", + "_ID": { + "offset": 0, + "table": 18 + }, + "source_file": "chat-assign/src/main/java/com/bank/chat/assign/config/AssignConfiguration.java" + } + ], + [ + "20d6b583bb28a9b84cd289bc46c6615f9348604a", + "ucs:20d6b583bb28a9b84cd289bc46c6615f9348604a:28:1056", + { + "_SRC": { + "offset": 321, + "table": 0 + }, + "_DST": { + "offset": 1, + "table": 1 + }, + "_LABEL": "UNRESOLVED_AT", + "_ID": { + "offset": 1, + "table": 18 + }, + "source_file": 
"chat-assign/src/main/java/com/bank/chat/assign/web/ChatManagementController.java" + } + ], + [ + "945ab963e7c38edb72d7ad35d22931274293848b", + "ucs:945ab963e7c38edb72d7ad35d22931274293848b:34:1292", + { + "_SRC": { + "offset": 322, + "table": 0 + }, + "_DST": { + "offset": 2, + "table": 1 + }, + "_LABEL": "UNRESOLVED_AT", + "_ID": { + "offset": 2, + "table": 18 + }, + "source_file": "chat-assign/src/main/java/com/bank/chat/assign/web/ChatManagementController.java" + } + ] + ] + } +} diff --git a/tests/test_ast_graph_build.py b/tests/test_ast_graph_build.py index 3751fad..d6579cb 100644 --- a/tests/test_ast_graph_build.py +++ b/tests/test_ast_graph_build.py @@ -17,7 +17,7 @@ import ladybug import pytest -from _builders import build_ladybug_to +from _builders import build_ladybug_into, build_ladybug_to from ast_java import ONTOLOGY_VERSION from graph_enrich import _load_brownfield_overrides, collect_annotation_meta_chain @@ -540,3 +540,157 @@ def test_pass1_parse_incremental_total_excludes_removed_files(tmp_path: Path, ca assert pass1_totals[0] == 1, ( f"incremental pass-1 total must exclude removed files; got {pass1_totals[0]}" ) + + +# --------------------------------------------------------------------------- +# PR-P1: Bulk COPY FROM for _write_edges +# --------------------------------------------------------------------------- + + +def _load_baseline() -> dict: + """Load the baseline fixture generated from the per-row _write_edges implementation.""" + baseline_path = Path(__file__).resolve().parent / "fixtures" / "graph_baseline_bank_chat.json" + with open(baseline_path, encoding="utf-8") as f: + return json.load(f) + + +def test_bulk_write_edges_match_per_row_baseline(ladybug_db_path: Path) -> None: + """Bulk COPY FROM produces identical graph to the per-row baseline. + + Asserts node count, per-type edge counts, GraphMeta counters, and sampled edge + properties match the baseline generated from the last per-row _write_edges build. 
+ """ + baseline = _load_baseline() + conn = _connect(ladybug_db_path) + + # Assert node count matches + node_count = int(conn.execute("MATCH (n:Symbol) RETURN COUNT(n)").get_next()[0]) + assert node_count == baseline["node_count"], f"node count mismatch: {node_count} vs {baseline['node_count']}" + + # Assert edge counts per type match + for edge_type, expected_count in baseline["edge_counts"].items(): + actual_count = int(conn.execute(f"MATCH ()-[r:{edge_type}]->() RETURN COUNT(r)").get_next()[0]) + assert actual_count == expected_count, f"{edge_type} count mismatch: {actual_count} vs {expected_count}" + + # Assert GraphMeta counters match (only PR-P1 edge types) + meta_row = conn.execute("MATCH (m:GraphMeta) RETURN m.ontology_version, m.counts_json").get_next() + assert int(meta_row[0]) == baseline["graph_meta"]["ontology_version"], "ontology_version mismatch" + # Parse both counts_json as LadybugDB-style unquoted JSON for comparison + actual_counts = _parse_ladybug_json(meta_row[1]) + expected_counts = _parse_ladybug_json(baseline["graph_meta"]["counts_json"]) + # Filter to only PR-P1 edge types (routes/clients/producers are PR-P2) + p1_keys = {"packages", "files", "types", "members", "phantoms", "extends", "implements", "injects", "declares", "overrides", "calls"} + actual_counts_p1 = {k: v for k, v in actual_counts.items() if k in p1_keys} + expected_counts_p1 = {k: v for k, v in expected_counts.items() if k in p1_keys} + assert actual_counts_p1 == expected_counts_p1, f"GraphMeta PR-P1 counts mismatch: {actual_counts_p1} vs {expected_counts_p1}" + + # Assert sampled edge properties match (verify CALLS callee_declaring_role is preserved) + for edge_type, sampled_baseline in baseline["sampled_edges"].items(): + result = conn.execute(f"MATCH (a)-[r:{edge_type}]->(b) RETURN a.id, b.id, r LIMIT 3") + actual_rows = [] + while result.has_next(): + actual_rows.append(result.get_next()) + assert len(actual_rows) == len(sampled_baseline), f"{edge_type}: sampled row 
def test_bulk_write_is_deterministic_double_build(corpus_root: Path, tmp_path: Path) -> None:
    """Two bulk builds of the same corpus must produce identical graphs.

    Modelled on tests/test_brownfield_routes.py::test_29_determinism_pass4_route_ids and
    tests/test_mcp_v2_compose.py::test_overrides_edge_set_deterministic_double_build.
    Compares Symbol count, per-type edge counts, GraphMeta ``counts_json`` and the
    distribution of ``callee_declaring_role`` over all CALLS edges.
    """
    db1 = tmp_path / "double1.lbug"
    db2 = tmp_path / "double2.lbug"
    build_ladybug_into(corpus_root, db1)
    build_ladybug_into(corpus_root, db2)

    conn1 = _connect(db1)
    conn2 = _connect(db2)

    def _role_histogram(conn) -> list[tuple]:
        # Aggregate + ORDER BY so the comparison does not depend on which
        # physical edge an unordered scan happens to return first.
        result = conn.execute(
            "MATCH ()-[c:CALLS]->() "
            "RETURN c.callee_declaring_role AS role, COUNT(*) AS n ORDER BY role"
        )
        histogram = []
        while result.has_next():
            role, n = result.get_next()
            histogram.append((role, int(n)))
        return histogram

    # Assert identical node counts
    count1 = int(conn1.execute("MATCH (n:Symbol) RETURN COUNT(n)").get_next()[0])
    count2 = int(conn2.execute("MATCH (n:Symbol) RETURN COUNT(n)").get_next()[0])
    assert count1 == count2, f"node count mismatch: {count1} vs {count2}"

    # Assert identical edge counts per type
    for edge_type in ["EXTENDS", "IMPLEMENTS", "INJECTS", "DECLARES", "OVERRIDES", "CALLS", "UNRESOLVED_AT"]:
        c1 = int(conn1.execute(f"MATCH ()-[r:{edge_type}]->() RETURN COUNT(r)").get_next()[0])
        c2 = int(conn2.execute(f"MATCH ()-[r:{edge_type}]->() RETURN COUNT(r)").get_next()[0])
        assert c1 == c2, f"{edge_type} count mismatch: {c1} vs {c2}"

    # Assert identical GraphMeta counters
    meta1 = conn1.execute("MATCH (m:GraphMeta) RETURN m.counts_json").get_next()[0]
    meta2 = conn2.execute("MATCH (m:GraphMeta) RETURN m.counts_json").get_next()[0]
    assert meta1 == meta2, "GraphMeta counts_json mismatch"

    # Assert identical callee_declaring_role distribution across every CALLS edge
    roles1 = _role_histogram(conn1)
    roles2 = _role_histogram(conn2)
    assert roles1 == roles2, f"CALLS callee_declaring_role mismatch: {roles1} vs {roles2}"


def test_bulk_write_preserves_calls_dedup_and_callee_declaring_role(ladybug_db_path: Path) -> None:
    """Bulk COPY FROM preserves CALLS dedup by (src, dst, argc, line) and callee_declaring_role.

    Reuses the @Service callee assertion against a bulk build to verify that the
    role materialized at staging time matches what the per-row path stored.
    """
    conn = _connect(ladybug_db_path)

    # Verify CALLS dedup: count unique (src_id, dst_id, arg_count, call_site_line) tuples
    result = conn.execute(
        "MATCH (a)-[c:CALLS]->(b) "
        "RETURN COUNT(DISTINCT {src: a.id, dst: b.id, argc: c.arg_count, line: c.call_site_line})"
    )
    unique_call_keys = int(result.get_next()[0])

    # Total CALLS count should equal unique keys (dedup applied)
    total_calls = int(conn.execute("MATCH ()-[c:CALLS]->() RETURN COUNT(c)").get_next()[0])
    assert unique_call_keys == total_calls, f"CALLS dedup failed: {unique_call_keys} unique keys vs {total_calls} total edges"

    # Verify callee_declaring_role: a call whose target member is declared by a
    # SERVICE-role type must carry callee_declaring_role = "SERVICE".
    # DECLARES points from the declaring type to the member and CALLS points at
    # the member, so both relationships meet on `callee`; the role filter
    # belongs on the declaring type.
    service_calls = conn.execute(
        "MATCH (declaring:Symbol)-[:DECLARES]->(callee:Symbol)<-[c:CALLS]-(caller:Symbol) "
        "WHERE declaring.role = 'SERVICE' "
        "RETURN DISTINCT c.callee_declaring_role LIMIT 10"
    )
    roles = []
    while service_calls.has_next():
        roles.append(service_calls.get_next()[0])
    # Guard against a vacuous pass: with no matching rows the loop below would
    # assert nothing at all.
    assert roles, "expected at least one CALLS edge into a member declared by a SERVICE type"
    for role in roles:
        assert role == "SERVICE", f"@Service callee has unexpected callee_declaring_role: {role}"


def test_bulk_write_empty_rel_table_is_noop(tmp_path: Path, capsys: pytest.CaptureFixture[str]) -> None:
    """Bulk COPY FROM with empty rows list is a no-op; corpus with no EXTENDS edges should not error."""
    import build_ast_graph

    root = tmp_path / "proj"
    java_dir = root / "src/main/java/smoke"
    java_dir.mkdir(parents=True)
    # Create a corpus with no inheritance (no EXTENDS/IMPLEMENTS edges)
    (java_dir / "NoInheritance.java").write_text(
        "package smoke;\nclass NoInheritance { void go() { } }\n", encoding="utf-8"
    )

    db_path = tmp_path / "no_inherits.lbug"
    tables = build_ast_graph.GraphTables()
    asts = build_ast_graph.pass1_parse(root, tables, verbose=False)
    build_ast_graph.pass2_edges(tables, asts, verbose=False)
    build_ast_graph.pass3_calls(tables, asts, verbose=False)
    build_ast_graph.pass4_routes(tables, asts, source_root=root, verbose=False)
    build_ast_graph.pass5_imperative_edges(tables, asts, source_root=root, verbose=False)
    build_ast_graph.pass6_match_edges(tables, verbose=False)

    # Build via bulk write (should not error on empty EXTENDS)
    build_ast_graph.write_ladybug(db_path, tables, source_root=root, verbose=False)

    # Verify EXTENDS table is empty
    conn = _connect(db_path)
    extends_count = int(conn.execute("MATCH ()-[r:EXTENDS]->() RETURN COUNT(r)").get_next()[0])
    assert extends_count == 0, "EXTENDS should be empty for this corpus"
+ assert '"callee_declaring_role"' in text, "bulk CALLS write must materialize callee_declaring_role" def test_brownfield_resolver_strategy_literals_emitted_in_builder_subset() -> None: