Skip to content

Commit 5cfd4f5

Browse files
committed
test(relationships): add test for standalone file yielding no extra chunks on expansion
1 parent c1fe53a commit 5cfd4f5

File tree

2 files changed

+64
-1
lines changed

2 files changed

+64
-1
lines changed

openrag/components/indexer/vectordb/utils.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -578,6 +578,7 @@ def get_file_ancestors(self, partition: str, file_id: str, max_ancestor_depth: i
578578
**(row.file_metadata or {}),
579579
}
580580
for row in result
581+
if row.relationship_id is not None # Only include files that are part of a relationship
581582
]
582583

583584
def get_ancestor_file_ids(self, partition: str, file_id: str, max_ancestor_depth: int | None = None) -> list[str]:

openrag/components/test_relationships.py

Lines changed: 63 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -143,6 +143,7 @@ def get_file_ancestors(self, partition: str, file_id: str, max_ancestor_depth: i
143143
"parent_id": row.parent_id,
144144
}
145145
for row in rows
146+
if row.relationship_id is not None # Only include files that are part of a relationship
146147
]
147148

148149
def get_ancestor_file_ids(self, partition: str, file_id: str, max_ancestor_depth: int | None = None) -> list[str]:
@@ -322,11 +323,12 @@ class TestGetFileAncestors:
322323
"""Test retrieving ancestor chain for a file."""
323324

324325
def test_get_file_ancestors_single_file(self, file_manager):
325-
"""Test that a file with no parent returns only itself."""
326+
"""Test that a file with no parent but with a relationship_id returns only itself."""
326327
file_manager.add_file_to_partition(
327328
partition="test_partition",
328329
file_id="root_email",
329330
file_metadata={"filename": "root.eml"},
331+
relationship_id="thread_single",
330332
)
331333

332334
ancestors = file_manager.get_file_ancestors(
@@ -378,24 +380,28 @@ def test_get_file_ancestors_returns_ordered_path(self, file_manager):
378380
partition="test_partition",
379381
file_id="file_a",
380382
file_metadata={"filename": "a.txt"},
383+
relationship_id="thread_ordered",
381384
)
382385
file_manager.add_file_to_partition(
383386
partition="test_partition",
384387
file_id="file_b",
385388
file_metadata={"filename": "b.txt"},
386389
parent_id="file_a",
390+
relationship_id="thread_ordered",
387391
)
388392
file_manager.add_file_to_partition(
389393
partition="test_partition",
390394
file_id="file_c",
391395
file_metadata={"filename": "c.txt"},
392396
parent_id="file_b",
397+
relationship_id="thread_ordered",
393398
)
394399
file_manager.add_file_to_partition(
395400
partition="test_partition",
396401
file_id="file_d",
397402
file_metadata={"filename": "d.txt"},
398403
parent_id="file_c",
404+
relationship_id="thread_ordered",
399405
)
400406

401407
ancestors = file_manager.get_file_ancestors(
@@ -424,12 +430,14 @@ def test_get_ancestor_file_ids(self, file_manager):
424430
partition="test_partition",
425431
file_id="parent_file",
426432
file_metadata={"filename": "parent.txt"},
433+
relationship_id="thread_ids",
427434
)
428435
file_manager.add_file_to_partition(
429436
partition="test_partition",
430437
file_id="child_file",
431438
file_metadata={"filename": "child.txt"},
432439
parent_id="parent_file",
440+
relationship_id="thread_ids",
433441
)
434442

435443
ancestor_ids = file_manager.get_ancestor_file_ids(
@@ -447,13 +455,15 @@ def test_get_file_ancestors_max_ancestor_depth_none_returns_all(self, file_manag
447455
partition="test_partition",
448456
file_id="level_0",
449457
file_metadata={"filename": "root.txt"},
458+
relationship_id="thread_depth_none",
450459
)
451460
for i in range(1, 6):
452461
file_manager.add_file_to_partition(
453462
partition="test_partition",
454463
file_id=f"level_{i}",
455464
file_metadata={"filename": f"level_{i}.txt"},
456465
parent_id=f"level_{i - 1}",
466+
relationship_id="thread_depth_none",
457467
)
458468

459469
# Without max_ancestor_depth (None), should return all 6 levels
@@ -474,13 +484,15 @@ def test_get_file_ancestors_max_ancestor_depth_limits_traversal(self, file_manag
474484
partition="test_partition",
475485
file_id="node_0",
476486
file_metadata={"filename": "root.txt"},
487+
relationship_id="thread_depth_limit",
477488
)
478489
for i in range(1, 6):
479490
file_manager.add_file_to_partition(
480491
partition="test_partition",
481492
file_id=f"node_{i}",
482493
file_metadata={"filename": f"node_{i}.txt"},
483494
parent_id=f"node_{i - 1}",
495+
relationship_id="thread_depth_limit",
484496
)
485497

486498
# With max_ancestor_depth=2, should return target (depth 0) + 2 ancestors
@@ -501,12 +513,14 @@ def test_get_file_ancestors_max_ancestor_depth_zero_returns_only_target(self, fi
501513
partition="test_partition",
502514
file_id="root",
503515
file_metadata={"filename": "root.txt"},
516+
relationship_id="thread_depth_zero",
504517
)
505518
file_manager.add_file_to_partition(
506519
partition="test_partition",
507520
file_id="child",
508521
file_metadata={"filename": "child.txt"},
509522
parent_id="root",
523+
relationship_id="thread_depth_zero",
510524
)
511525

512526
# max_ancestor_depth=0 means no traversal beyond the target
@@ -527,18 +541,21 @@ def test_get_file_ancestors_max_ancestor_depth_exceeds_chain_length(self, file_m
527541
partition="test_partition",
528542
file_id="short_0",
529543
file_metadata={"filename": "a.txt"},
544+
relationship_id="thread_short",
530545
)
531546
file_manager.add_file_to_partition(
532547
partition="test_partition",
533548
file_id="short_1",
534549
file_metadata={"filename": "b.txt"},
535550
parent_id="short_0",
551+
relationship_id="thread_short",
536552
)
537553
file_manager.add_file_to_partition(
538554
partition="test_partition",
539555
file_id="short_2",
540556
file_metadata={"filename": "c.txt"},
541557
parent_id="short_1",
558+
relationship_id="thread_short",
542559
)
543560

544561
# max_ancestor_depth=100 but chain is only 3 levels
@@ -560,13 +577,15 @@ def test_get_ancestor_file_ids_with_max_ancestor_depth(self, file_manager):
560577
partition="test_partition",
561578
file_id="chain_0",
562579
file_metadata={"filename": "a.txt"},
580+
relationship_id="thread_chain",
563581
)
564582
for i in range(1, 4):
565583
file_manager.add_file_to_partition(
566584
partition="test_partition",
567585
file_id=f"chain_{i}",
568586
file_metadata={"filename": f"{chr(97 + i)}.txt"},
569587
parent_id=f"chain_{i - 1}",
588+
relationship_id="thread_chain",
570589
)
571590

572591
# With max_ancestor_depth=1, should get target + 1 ancestor
@@ -580,6 +599,49 @@ def test_get_ancestor_file_ids_with_max_ancestor_depth(self, file_manager):
580599
assert ancestor_ids == ["chain_2", "chain_3"]
581600

582601

602+
class TestStandaloneFileNoExpansion:
    """Test that a file indexed without relationship_id yields no additional chunks
    when include_related and include_ancestors are both active."""

    def test_no_extra_chunks_for_file_without_relationship_id(self, file_manager):
        """A standalone file (no relationship_id, no parent_id) must not bring
        additional files when both include_related and include_ancestors are activated.

        Mirrors the logic in _expand_with_related_chunks:
        - include_related: the guard `metadata.get("relationship_id")` is falsy,
          so no related lookup is issued and the related task set stays empty.
        - include_ancestors: get_file_ancestors only keeps rows whose
          relationship_id is not None, so a standalone file yields an empty
          ancestor list and nothing new is added.
        """
        file_manager.add_file_to_partition(
            partition="test_partition",
            file_id="standalone",
            file_metadata={"filename": "standalone.pdf"},
            # No relationship_id, no parent_id
        )

        # Sanity check: the file's own id must not accidentally act as a
        # relationship key, so looking it up as a relationship finds nothing.
        files = file_manager.get_files_by_relationship(
            partition="test_partition",
            relationship_id="standalone",  # non-existent → empty
        )
        assert files == [], "No files should share a relationship with a standalone file"

        # Verify the stored row has no relationship_id (the falsy guard that
        # prevents the include_related lookup from being issued at all).
        with file_manager.Session() as session:
            row = session.execute(
                text("SELECT relationship_id FROM files WHERE file_id = :file_id"),
                {"file_id": "standalone"},
            ).fetchone()
        # Fail with a clear message instead of a TypeError on `row[0]` if the
        # insert silently did not happen.
        assert row is not None, "standalone file must have been persisted"
        assert not row[0], "relationship_id must be falsy so include_related is skipped"

        ancestors = file_manager.get_file_ancestors(
            partition="test_partition",
            file_id="standalone",
        )
        # Compare against the empty list so a failure shows the unexpected rows.
        assert ancestors == [], (
            "Standalone file has no relationship_id, so ancestor list must be empty — "
            "the relationship_id filter in get_file_ancestors excludes it"
        )
643+
644+
583645
class TestFileModelFields:
584646
"""Test that File model correctly handles relationship fields."""
585647

0 commit comments

Comments
 (0)