Skip to content

Commit 1218420

Browse files
authored
simplify neo4j example (#399)
1 parent deeef7e commit 1218420

File tree

1 file changed

+33
-49
lines changed
  • examples/docs_to_knowledge_graph

1 file changed

+33
-49
lines changed

examples/docs_to_knowledge_graph/main.py

Lines changed: 33 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
"""
2-
This example shows how to extract relationships from Markdown documents and build a knowledge graph.
2+
This example shows how to extract relationships from documents and build a knowledge graph.
33
"""
44
import dataclasses
55
from dotenv import load_dotenv
@@ -13,17 +13,17 @@ class DocumentSummary:
1313

1414
@dataclasses.dataclass
1515
class Relationship:
16-
"""Describe a relationship between two nodes."""
16+
"""Describe a relationship between two entities."""
1717
subject: str
1818
predicate: str
1919
object: str
2020

2121
@cocoindex.flow_def(name="DocsToKG")
2222
def docs_to_kg_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope):
2323
"""
24-
Define an example flow that extracts triples from files and build knowledge graph.
24+
Define an example flow that extracts relationships from files and builds a knowledge graph.
2525
"""
26-
26+
# configure neo4j connection
2727
conn_spec = cocoindex.add_auth_entry(
2828
"Neo4jConnection",
2929
cocoindex.storages.Neo4jConnection(
@@ -41,78 +41,66 @@ def docs_to_kg_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.D
4141
entity_mention = data_scope.add_collector()
4242

4343
with data_scope["documents"].row() as doc:
44-
doc["chunks"] = doc["content"].transform(
45-
cocoindex.functions.SplitRecursively(),
46-
language="markdown", chunk_size=10000)
47-
44+
# extract summary from document
4845
doc["summary"] = doc["content"].transform(
4946
cocoindex.functions.ExtractByLlm(
5047
llm_spec=cocoindex.LlmSpec(
48+
# Supported LLM: https://cocoindex.io/docs/ai/llm
5149
api_type=cocoindex.LlmApiType.OPENAI, model="gpt-4o"),
5250
output_type=DocumentSummary,
5351
instruction="Please summarize the content of the document."))
5452
document_node.collect(
5553
filename=doc["filename"], title=doc["summary"]["title"],
5654
summary=doc["summary"]["summary"])
5755

58-
with doc["chunks"].row() as chunk:
59-
chunk["relationships"] = chunk["text"].transform(
60-
cocoindex.functions.ExtractByLlm(
61-
llm_spec=cocoindex.LlmSpec(
62-
api_type=cocoindex.LlmApiType.OPENAI, model="gpt-4o"),
63-
# Replace by this spec below, to use Ollama API instead of OpenAI
64-
# llm_spec=cocoindex.LlmSpec(
65-
# api_type=cocoindex.LlmApiType.OLLAMA, model="llama3.2"),
56+
# extract relationships from document
57+
doc["relationships"] = doc["content"].transform(
58+
cocoindex.functions.ExtractByLlm(
59+
llm_spec=cocoindex.LlmSpec(
60+
# Supported LLM: https://cocoindex.io/docs/ai/llm
61+
api_type=cocoindex.LlmApiType.OPENAI, model="gpt-4o"),
6662
output_type=list[Relationship],
6763
instruction=(
6864
"Please extract relationships from CocoIndex documents. "
6965
"Focus on concepts and ingnore specific examples. "
7066
"Each relationship should be a tuple of (subject, predicate, object).")))
7167

72-
with chunk["relationships"].row() as relationship:
73-
relationship["subject_embedding"] = relationship["subject"].transform(
74-
cocoindex.functions.SentenceTransformerEmbed(
75-
model="sentence-transformers/all-MiniLM-L6-v2"))
76-
relationship["object_embedding"] = relationship["object"].transform(
77-
cocoindex.functions.SentenceTransformerEmbed(
78-
model="sentence-transformers/all-MiniLM-L6-v2"))
79-
entity_relationship.collect(
80-
id=cocoindex.GeneratedField.UUID,
81-
subject=relationship["subject"],
82-
subject_embedding=relationship["subject_embedding"],
83-
object=relationship["object"],
84-
object_embedding=relationship["object_embedding"],
85-
predicate=relationship["predicate"],
86-
)
87-
entity_mention.collect(
88-
id=cocoindex.GeneratedField.UUID, entity=relationship["subject"],
89-
filename=doc["filename"], location=chunk["location"],
90-
)
91-
entity_mention.collect(
92-
id=cocoindex.GeneratedField.UUID, entity=relationship["object"],
93-
filename=doc["filename"], location=chunk["location"],
94-
)
68+
with doc["relationships"].row() as relationship:
69+
# relationship between two entities
70+
entity_relationship.collect(
71+
id=cocoindex.GeneratedField.UUID,
72+
subject=relationship["subject"],
73+
object=relationship["object"],
74+
predicate=relationship["predicate"],
75+
)
76+
# mention of an entity in a document, for subject
77+
entity_mention.collect(
78+
id=cocoindex.GeneratedField.UUID, entity=relationship["subject"],
79+
filename=doc["filename"],
80+
)
81+
# mention of an entity in a document, for object
82+
entity_mention.collect(
83+
id=cocoindex.GeneratedField.UUID, entity=relationship["object"],
84+
filename=doc["filename"],
85+
)
86+
9587

88+
# export to neo4j
9689
document_node.export(
9790
"document_node",
9891
cocoindex.storages.Neo4j(
9992
connection=conn_spec,
10093
mapping=cocoindex.storages.NodeMapping(label="Document")),
10194
primary_key_fields=["filename"],
10295
)
96+
# Declare the referenced "Entity" node that relationships point to as their source and target
10397
flow_builder.declare(
10498
cocoindex.storages.Neo4jDeclarations(
10599
connection=conn_spec,
106100
referenced_nodes=[
107101
cocoindex.storages.ReferencedNode(
108102
label="Entity",
109103
primary_key_fields=["value"],
110-
vector_indexes=[
111-
cocoindex.VectorIndexDef(
112-
field_name="embedding",
113-
metric=cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY,
114-
),
115-
],
116104
)
117105
]
118106
)
@@ -128,17 +116,13 @@ def docs_to_kg_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.D
128116
fields=[
129117
cocoindex.storages.TargetFieldMapping(
130118
source="subject", target="value"),
131-
cocoindex.storages.TargetFieldMapping(
132-
source="subject_embedding", target="embedding"),
133119
]
134120
),
135121
target=cocoindex.storages.NodeReferenceMapping(
136122
label="Entity",
137123
fields=[
138124
cocoindex.storages.TargetFieldMapping(
139125
source="object", target="value"),
140-
cocoindex.storages.TargetFieldMapping(
141-
source="object_embedding", target="embedding"),
142126
]
143127
),
144128
),

0 commit comments

Comments
 (0)