Skip to content

Commit 1089414

Browse files
authored
Update docs_to_kg example to add Document node and MENTION rel (#318)
1 parent 762031a commit 1089414

File tree

3 files changed

+67
-10
lines changed

3 files changed

+67
-10
lines changed

examples/docs_to_kg/main.py

Lines changed: 55 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,11 @@
55
from dotenv import load_dotenv
66
import cocoindex
77

8+
@dataclasses.dataclass
class DocumentSummary:
    """Describe a summary of a document."""
    # Structured result type for the per-document summarisation step: it is
    # passed as `output_type` to `cocoindex.functions.ExtractByLlm`, and its
    # two fields are then collected into the document node alongside the
    # file name.
    # NOTE(review): the docstring above is kept verbatim on purpose — it may
    # be surfaced to the LLM as part of the output schema; confirm before
    # rewording it.

    # Title for the document; collected as the node's `title` field.
    title: str
    # Summary text for the document; collected as the node's `summary` field.
    summary: str
813

914
@dataclasses.dataclass
1015
class Relationship:
@@ -31,13 +36,25 @@ def docs_to_kg_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.D
3136
cocoindex.sources.LocalFile(path="../../docs/docs/core",
3237
included_patterns=["*.md", "*.mdx"]))
3338

34-
relationships = data_scope.add_collector()
39+
document_node = data_scope.add_collector()
40+
entity_relationship = data_scope.add_collector()
41+
entity_mention = data_scope.add_collector()
3542

3643
with data_scope["documents"].row() as doc:
3744
doc["chunks"] = doc["content"].transform(
3845
cocoindex.functions.SplitRecursively(),
3946
language="markdown", chunk_size=10000)
4047

48+
doc["summary"] = doc["content"].transform(
49+
cocoindex.functions.ExtractByLlm(
50+
llm_spec=cocoindex.LlmSpec(
51+
api_type=cocoindex.LlmApiType.OPENAI, model="gpt-4o"),
52+
output_type=DocumentSummary,
53+
instruction="Please summarize the content of the document."))
54+
document_node.collect(
55+
filename=doc["filename"], title=doc["summary"]["title"],
56+
summary=doc["summary"]["summary"])
57+
4158
with doc["chunks"].row() as chunk:
4259
chunk["relationships"] = chunk["text"].transform(
4360
cocoindex.functions.ExtractByLlm(
@@ -59,17 +76,31 @@ def docs_to_kg_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.D
5976
relationship["object_embedding"] = relationship["object"].transform(
6077
cocoindex.functions.SentenceTransformerEmbed(
6178
model="sentence-transformers/all-MiniLM-L6-v2"))
62-
relationships.collect(
79+
entity_relationship.collect(
6380
id=cocoindex.GeneratedField.UUID,
6481
subject=relationship["subject"],
6582
subject_embedding=relationship["subject_embedding"],
6683
object=relationship["object"],
6784
object_embedding=relationship["object_embedding"],
6885
predicate=relationship["predicate"],
6986
)
70-
71-
relationships.export(
72-
"relationships",
87+
entity_mention.collect(
88+
id=cocoindex.GeneratedField.UUID, entity=relationship["subject"],
89+
filename=doc["filename"], location=chunk["location"],
90+
)
91+
entity_mention.collect(
92+
id=cocoindex.GeneratedField.UUID, entity=relationship["object"],
93+
filename=doc["filename"], location=chunk["location"],
94+
)
95+
document_node.export(
96+
"document_node",
97+
cocoindex.storages.Neo4j(
98+
connection=conn_spec,
99+
mapping=cocoindex.storages.Neo4jNode(label="Document")),
100+
primary_key_fields=["filename"],
101+
)
102+
entity_relationship.export(
103+
"entity_relationship",
73104
cocoindex.storages.Neo4j(
74105
connection=conn_spec,
75106
mapping=cocoindex.storages.Neo4jRelationship(
@@ -107,6 +138,25 @@ def docs_to_kg_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.D
107138
),
108139
primary_key_fields=["id"],
109140
)
141+
entity_mention.export(
142+
"entity_mention",
143+
cocoindex.storages.Neo4j(
144+
connection=conn_spec,
145+
mapping=cocoindex.storages.Neo4jRelationship(
146+
rel_type="MENTION",
147+
source=cocoindex.storages.Neo4jRelationshipEnd(
148+
label="Document",
149+
fields=[cocoindex.storages.Neo4jFieldMapping("filename")],
150+
),
151+
target=cocoindex.storages.Neo4jRelationshipEnd(
152+
label="Entity",
153+
fields=[cocoindex.storages.Neo4jFieldMapping(
154+
field_name="entity", node_field_name="value")],
155+
),
156+
),
157+
),
158+
primary_key_fields=["id"],
159+
)
110160

111161
@cocoindex.main_fn()
112162
def _run():

python/cocoindex/storages.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,7 @@ class Neo4jRelationship:
6363
rel_type: str
6464
source: Neo4jRelationshipEnd
6565
target: Neo4jRelationshipEnd
66-
nodes: dict[str, Neo4jRelationshipNode]
66+
nodes: dict[str, Neo4jRelationshipNode] | None = None
6767

6868
class Neo4j(op.StorageSpec):
6969
"""Graph storage powered by Neo4j."""

src/ops/storages/neo4j.rs

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@ pub struct RelationshipSpec {
5656
rel_type: String,
5757
source: RelationshipEndSpec,
5858
target: RelationshipEndSpec,
59-
nodes: BTreeMap<String, RelationshipNodeSpec>,
59+
nodes: Option<BTreeMap<String, RelationshipNodeSpec>>,
6060
}
6161

6262
#[derive(Debug, Deserialize)]
@@ -693,7 +693,7 @@ impl RelationshipSetupState {
693693
rel_spec.rel_type
694694
)
695695
})?;
696-
for (label, node) in rel_spec.nodes.iter() {
696+
for (label, node) in rel_spec.nodes.iter().flatten() {
697697
sub_components.push(ComponentState {
698698
object_label: ElementType::Node(label.clone()),
699699
index_def: IndexDef::KeyConstraint {
@@ -720,7 +720,13 @@ impl RelationshipSetupState {
720720
});
721721
}
722722
}
723-
dependent_node_labels.extend(rel_spec.nodes.keys().cloned());
723+
dependent_node_labels.extend(
724+
rel_spec
725+
.nodes
726+
.iter()
727+
.flat_map(|nodes| nodes.keys())
728+
.cloned(),
729+
);
724730
}
725731
};
726732
Ok(Self {
@@ -1069,7 +1075,8 @@ impl<'a> DependentNodeLabelAnalyzer<'a> {
10691075
.collect(),
10701076
index_options: rel_spec
10711077
.nodes
1072-
.get(&rel_end_spec.label)
1078+
.as_ref()
1079+
.and_then(|nodes| nodes.get(&rel_end_spec.label))
10731080
.and_then(|node_spec| Some(&node_spec.index_options)),
10741081
})
10751082
}

0 commit comments

Comments
 (0)