Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
102 changes: 81 additions & 21 deletions build_ast_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -188,6 +188,12 @@ class TypeIndexEntry:
package: str
outer_fqn: str | None
node_id: str
# True when this entry was loaded from the existing graph by
# `_load_existing_types` (an unchanged-file stub used only for cross-file
# resolution). Its `decl` is a placeholder (no annotations/methods), so its
# recomputed role/capabilities must never be written back over the real
# stored values. See `_write_nodes_impl`.
loaded_from_db: bool = False


@dataclass
Expand All @@ -200,6 +206,11 @@ class MemberEntry:
module: str
microservice: str
node_id: str
# True when loaded from the existing graph by `_load_existing_members`
# (an unchanged-file stub used only for cross-file call resolution). Its
# DECLARES edge already persists in the graph, so it must not be re-emitted
# by `_populate_declares_rows` (REL tables have no PK → would duplicate).
loaded_from_db: bool = False


@dataclass
Expand Down Expand Up @@ -556,7 +567,7 @@ def _load_existing_types(conn: ladybug.Connection, tables: GraphTables, exclude_
if exclude_files is not None and not exclude_files:
return

where = "WHERE s.kind IN ['class', 'interface', 'enum', 'annotation', 'record']"
where = f"WHERE s.kind IN {list(_TYPE_KINDS)}"
params: dict = {}
if exclude_files:
where += "\n AND NOT (s.filename IN $exclude_files)"
Expand Down Expand Up @@ -586,6 +597,7 @@ def _load_existing_types(conn: ladybug.Connection, tables: GraphTables, exclude_
package=package,
outer_fqn=None,
node_id=node_id,
loaded_from_db=True,
)
tables.types[fqn] = entry
tables.by_simple_name.setdefault(name, []).append(entry)
Expand Down Expand Up @@ -634,6 +646,7 @@ def _load_existing_members(conn: ladybug.Connection, tables: GraphTables, exclud
module="",
microservice="",
node_id=node_id,
loaded_from_db=True,
))


Expand Down Expand Up @@ -3044,19 +3057,6 @@ def _existing_node_ids(conn: ladybug.Connection) -> set[str]:
return ids


def _existing_symbol_ids(conn: ladybug.Connection) -> set[str]:
"""Return every Symbol node id currently in the graph.

Deprecated: use _existing_node_ids for filtering all node types.
Kept for compatibility with existing _write_edges implementation.
"""
result = conn.execute("MATCH (n:Symbol) RETURN n.id")
ids: set[str] = set()
while result.has_next():
ids.add(result.get_next()[0])
return ids


# Column-order constants for bulk COPY FROM.
# For REL tables, the first two entries are FROM/TO node primary keys (kuzu requirement).
# Order matches the corresponding _SCHEMA_* declarations above.
Expand All @@ -3066,6 +3066,29 @@ def _existing_symbol_ids(conn: ladybug.Connection) -> set[str]:
"modifiers", "annotations", "capabilities", "role", "signature", "parent_id", "resolved"
]

# Type declaration kinds. Tuple (not set) so the rendered SQL `IN` clause is
# deterministic. Used to (a) load type stubs for cross-file resolution and
# (b) scope the incremental property-refresh SET to type nodes.
_TYPE_KINDS: tuple[str, ...] = ("class", "interface", "enum", "annotation", "record")

# Update every mutable Symbol field on an existing node by primary key. Used on
# the incremental path to refresh preserved dependent type nodes whose
# `role`/`capabilities` (and other project-wide-derived fields) can shift
# without their own source changing — restoring the upsert the legacy per-row
# `MERGE (n:Symbol {id:$id}) SET …` provided. Field list mirrors `_NODE_COLUMNS`
# minus `id`.
_SET_SYMBOL_BY_ID = (
"MATCH (n:Symbol {id: $id}) "
"SET n.kind = $kind, n.name = $name, n.fqn = $fqn, "
"n.package = $package, n.module = $module, n.microservice = $microservice, "
"n.filename = $filename, "
"n.start_line = $start_line, n.end_line = $end_line, "
"n.start_byte = $start_byte, n.end_byte = $end_byte, "
"n.modifiers = $modifiers, n.annotations = $annotations, "
"n.capabilities = $capabilities, n.role = $role, "
"n.signature = $signature, n.parent_id = $parent_id, n.resolved = $resolved"
)

_REL_EXTENDS_COLUMNS = ["FROM", "TO", "source_file", "dst_name", "dst_fqn", "resolved"]
_REL_IMPLEMENTS_COLUMNS = ["FROM", "TO", "source_file", "dst_name", "dst_fqn", "resolved"]
_REL_INJECTS_COLUMNS = ["FROM", "TO", "source_file", "dst_name", "dst_fqn", "resolved", "mechanism", "annotation", "field_or_param"]
Expand Down Expand Up @@ -3106,6 +3129,10 @@ def _write_nodes_impl(

# Stage all Symbol rows
rows: list[dict] = []
# Node ids loaded from the existing graph as resolution-only stubs
# (`_load_existing_types`); their staged rows carry placeholder values and
# must never be written back over the real nodes.
stub_ids: set[str] = set()

# packages
for pkg, pid in tables.packages.items():
Expand All @@ -3119,6 +3146,8 @@ def _write_nodes_impl(
))
# types
for entry in tables.types.values():
if entry.loaded_from_db:
stub_ids.add(entry.node_id)
d = entry.decl
role, capabilities = resolve_role_and_capabilities(
d,
Expand Down Expand Up @@ -3158,14 +3187,34 @@ def _write_nodes_impl(
for pid, row in tables.phantoms.items():
rows.append(row)

# For incremental path, filter out nodes that already exist to avoid duplicate primary key errors
# The full rebuild path starts with an empty database, so all rows are new
# Bulk-load new Symbol rows. The full-rebuild path starts from an empty
# database (`_drop_all`), so every row is new. The incremental path reaches
# here with a populated database: changed-file nodes were deleted by
# `_delete_file_scope` (absent here → new), while dependent-file nodes are
# deliberately preserved (see `_delete_file_scope` / issue #305).
existing_ids = _existing_node_ids(conn)
new_rows = [row for row in rows if row["id"] not in existing_ids]

# Bulk-load only new Symbol rows
_bulk_copy(conn, "Symbol", _NODE_COLUMNS, new_rows)

# Refresh mutable properties on preserved dependent TYPE nodes (incremental
# path only; `update_rows` is empty on the full path). `role`/`capabilities`
# — and any other field derived from project-wide inputs (meta-annotation
# chain, brownfield overrides) — can shift without the type's own source
# changing, so a preserved dependent must be re-SET to stay byte-equivalent
# with a full rebuild. The legacy per-row `_MERGE_SYMBOL` upserted every
# staged node and did this implicitly; bulk `COPY FROM` only appends, so the
# SET is explicit here. Stubs (`stub_ids`) are skipped: their decl is a
# placeholder and their stored values are authoritative. Non-type kinds
# carry no mutable role/capabilities, so they are skipped too.
update_rows = [
row for row in rows
if row["id"] in existing_ids
and row["id"] not in stub_ids
and row["kind"] in _TYPE_KINDS
]
for row in update_rows:
conn.execute(_SET_SYMBOL_BY_ID, row)


def _write_nodes(
conn: ladybug.Connection,
Expand All @@ -3180,8 +3229,15 @@ def _write_nodes(


def _populate_declares_rows(tables: GraphTables) -> None:
# Skip members loaded from the existing graph for cross-file resolution: a
# DECLARES edge for an unchanged-file member already persists (its
# source_file is out of scope, so `_delete_file_scope` left it), and
# re-emitting it would append a duplicate (REL tables carry no primary key).
# Full-rebuild never loads members, so this is a no-op there.
tables.declares_rows = [
DeclaresRow(src_id=m.parent_id, dst_id=m.node_id) for m in tables.members
DeclaresRow(src_id=m.parent_id, dst_id=m.node_id)
for m in tables.members
if not m.loaded_from_db
]


Expand Down Expand Up @@ -3884,8 +3940,12 @@ def _write_clients_producers_and_calls(conn: ladybug.Connection, tables: GraphTa
Route nodes (created by pass5 for cross-service calls) that wouldn't
otherwise exist in LadybugDB.
"""
# Write phantom routes that don't already exist (pass5 creates these for cross-service calls)
# Intentionally retained MERGE for dedup against routes written during scoped step
# Upsert every route via MERGE. `tables.routes_rows` is the full route set
# (pass4 routes + pass5 phantom routes), not just phantoms; MERGE is
# idempotent against routes already written during the scoped step, so it
# neither duplicates nor drops them. This is the one remaining per-row graph
# write — converting it to bulk COPY requires filtering against existing
# route ids to reproduce the dedup, and is left as a future optimization.
for row in tables.routes_rows:
conn.execute(
"MERGE (r:Route {id: $id}) "
Expand Down
2 changes: 1 addition & 1 deletion plans/AGENT-PROMPTS-INIT-INCREMENT-PERF.md
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,7 @@ lines vs the pre-PR baseline.
- [ ] Step-1 spike result recorded in `_bulk_copy` docstring.
- [ ] `_write_edges` stages per-type rows (CALLS dedup + callee_declaring_role at staging); UnresolvedCallSite bulk-loaded before UNRESOLVED_AT.
- [ ] `_CREATE_EXT/IMPL/INJ/DECL/OVERRIDES/CALL` + local `_CREATE_UNRESOLVED/_UNRESOLVED_AT` deleted.
- [ ] `test_bulk_write_edges_match_per_row_baseline`, `test_bulk_write_is_deterministic_double_build`, `test_bulk_write_preserves_calls_dedup_and_callee_declaring_role`, `test_bulk_write_empty_rel_table_is_noop` pass.
- [ ] `test_bank_chat_bulk_build_matches_committed_baseline` (renamed from `test_bulk_write_edges_match_per_row_baseline` in PR-P4), `test_bulk_write_is_deterministic_double_build`, `test_bulk_write_preserves_calls_dedup_and_callee_declaring_role`, `test_bulk_write_empty_rel_table_is_noop` pass.
- [ ] Full `test_ast_graph_build.py` + `test_incremental_graph.py` pass unchanged.
- [ ] Sentinel greps: zero where required, non-zero where required.
- [ ] `.venv/bin/ruff check .` clean; benchmark in PR description.
Expand Down
11 changes: 7 additions & 4 deletions plans/active/PLAN-INIT-INCREMENT-PERF.md
Original file line number Diff line number Diff line change
Expand Up @@ -161,9 +161,12 @@ phase delta. Not packaged; documents the measured speedup in the PR description.

## Tests for PR-P1

1. `test_bulk_write_edges_match_per_row_baseline` — build `tests/bank-chat-system`
via the bulk path, assert node count, per-type edge counts, `GraphMeta`
counters, and sampled edge rows equal `graph_baseline_bank_chat.json`.
1. `test_bank_chat_bulk_build_matches_committed_baseline` (renamed from
`test_bulk_write_edges_match_per_row_baseline` in PR-P4 once the per-row
reference was gone) — build `tests/bank-chat-system` via the bulk path,
assert node count, per-type edge counts, `GraphMeta` counters, and sampled
edge rows equal `graph_baseline_bank_chat.json` (a drift anchor, not a
per-row equivalence proof).
2. `test_bulk_write_is_deterministic_double_build` — build bank-chat twice to two
DBs via the bulk path, assert identical counts + query battery. Models on
`tests/test_brownfield_routes.py::test_29_determinism_pass4_route_ids` and
Expand All @@ -183,7 +186,7 @@ phase delta. Not packaged; documents the measured speedup in the PR description.
- [ ] `_bulk_copy` helper added; step-1 spike result in its docstring.
- [ ] `_write_edges` stages per-type rows (CALLS dedup + `callee_declaring_role` at staging) and bulk-loads UnresolvedCallSite before UNRESOLVED_AT.
- [ ] `_CREATE_EXT/IMPL/INJ/DECL/OVERRIDES/CALL` deleted; local `_CREATE_UNRESOLVED/_UNRESOLVED_AT` gone with the rewrite.
- [ ] `test_bulk_write_edges_match_per_row_baseline`,
- [ ] `test_bank_chat_bulk_build_matches_committed_baseline`,
`test_bulk_write_is_deterministic_double_build`,
`test_bulk_write_preserves_calls_dedup_and_callee_declaring_role`,
`test_bulk_write_empty_rel_table_is_noop` pass.
Expand Down
19 changes: 14 additions & 5 deletions tests/test_ast_graph_build.py
Original file line number Diff line number Diff line change
Expand Up @@ -554,11 +554,20 @@ def _load_baseline() -> dict:
return json.load(f)


def test_bulk_write_edges_match_per_row_baseline(ladybug_db_path: Path) -> None:
"""Bulk COPY FROM produces identical graph to the per-row baseline.

Asserts node count, per-type edge counts, GraphMeta counters, and sampled edge
properties match the baseline generated from the last per-row _write_edges build.
def test_bank_chat_bulk_build_matches_committed_baseline(ladybug_db_path: Path) -> None:
"""Bank-chat full build matches the committed baseline (a drift anchor).

Asserts node count, per-type edge counts, GraphMeta counters, and sampled
CALLS properties match ``graph_baseline_bank_chat.json``.

This is a **regression anchor**, not a per-row equivalence proof: the legacy
per-row ``_write_edges`` was removed in PR-P1, so there is no per-row
reference to compare against, and this fixture was itself generated from a
bulk build (commit 8261acf). It guards against unintended drift in the
bank-chat full-build graph. The actual write-mechanism equivalence gates are
``test_bulk_write_is_deterministic_double_build`` (two bulk builds identical)
and ``test_incremental_bulk_write_equivalent_to_full_rebuild`` (incremental
matches a full rebuild of the same state).
"""
baseline = _load_baseline()
conn = _connect(ladybug_db_path)
Expand Down
Loading
Loading