11"""
2- This example shows how to extract relationships from Markdown documents and build a knowledge graph.
2+ This example shows how to extract relationships from documents and build a knowledge graph.
33"""
44import dataclasses
55from dotenv import load_dotenv
@@ -13,17 +13,17 @@ class DocumentSummary:
1313
1414@dataclasses .dataclass
1515class Relationship :
16- """Describe a relationship between two nodes ."""
16+ """Describe a relationship between two entities ."""
1717 subject : str
1818 predicate : str
1919 object : str
2020
2121@cocoindex .flow_def (name = "DocsToKG" )
2222def docs_to_kg_flow (flow_builder : cocoindex .FlowBuilder , data_scope : cocoindex .DataScope ):
2323 """
24- Define an example flow that extracts triples from files and build knowledge graph.
24+ Define an example flow that extracts relationships from files and builds a knowledge graph.
2525 """
26-
26+ # configure neo4j connection
2727 conn_spec = cocoindex .add_auth_entry (
2828 "Neo4jConnection" ,
2929 cocoindex .storages .Neo4jConnection (
@@ -41,78 +41,66 @@ def docs_to_kg_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.D
4141 entity_mention = data_scope .add_collector ()
4242
4343 with data_scope ["documents" ].row () as doc :
44- doc ["chunks" ] = doc ["content" ].transform (
45- cocoindex .functions .SplitRecursively (),
46- language = "markdown" , chunk_size = 10000 )
47-
44+ # extract summary from document
4845 doc ["summary" ] = doc ["content" ].transform (
4946 cocoindex .functions .ExtractByLlm (
5047 llm_spec = cocoindex .LlmSpec (
48+ # Supported LLM: https://cocoindex.io/docs/ai/llm
5149 api_type = cocoindex .LlmApiType .OPENAI , model = "gpt-4o" ),
5250 output_type = DocumentSummary ,
5351 instruction = "Please summarize the content of the document." ))
5452 document_node .collect (
5553 filename = doc ["filename" ], title = doc ["summary" ]["title" ],
5654 summary = doc ["summary" ]["summary" ])
5755
58- with doc ["chunks" ].row () as chunk :
59- chunk ["relationships" ] = chunk ["text" ].transform (
60- cocoindex .functions .ExtractByLlm (
61- llm_spec = cocoindex .LlmSpec (
62- api_type = cocoindex .LlmApiType .OPENAI , model = "gpt-4o" ),
63- # Replace by this spec below, to use Ollama API instead of OpenAI
64- # llm_spec=cocoindex.LlmSpec(
65- # api_type=cocoindex.LlmApiType.OLLAMA, model="llama3.2"),
56+ # extract relationships from document
57+ doc ["relationships" ] = doc ["content" ].transform (
58+ cocoindex .functions .ExtractByLlm (
59+ llm_spec = cocoindex .LlmSpec (
60+ # Supported LLM: https://cocoindex.io/docs/ai/llm
61+ api_type = cocoindex .LlmApiType .OPENAI , model = "gpt-4o" ),
6662 output_type = list [Relationship ],
6763 instruction = (
6864 "Please extract relationships from CocoIndex documents. "
6965 "Focus on concepts and ingnore specific examples. "
7066 "Each relationship should be a tuple of (subject, predicate, object)." )))
7167
72- with chunk ["relationships" ].row () as relationship :
73- relationship ["subject_embedding" ] = relationship ["subject" ].transform (
74- cocoindex .functions .SentenceTransformerEmbed (
75- model = "sentence-transformers/all-MiniLM-L6-v2" ))
76- relationship ["object_embedding" ] = relationship ["object" ].transform (
77- cocoindex .functions .SentenceTransformerEmbed (
78- model = "sentence-transformers/all-MiniLM-L6-v2" ))
79- entity_relationship .collect (
80- id = cocoindex .GeneratedField .UUID ,
81- subject = relationship ["subject" ],
82- subject_embedding = relationship ["subject_embedding" ],
83- object = relationship ["object" ],
84- object_embedding = relationship ["object_embedding" ],
85- predicate = relationship ["predicate" ],
86- )
87- entity_mention .collect (
88- id = cocoindex .GeneratedField .UUID , entity = relationship ["subject" ],
89- filename = doc ["filename" ], location = chunk ["location" ],
90- )
91- entity_mention .collect (
92- id = cocoindex .GeneratedField .UUID , entity = relationship ["object" ],
93- filename = doc ["filename" ], location = chunk ["location" ],
94- )
68+ with doc ["relationships" ].row () as relationship :
69+ # relationship between two entities
70+ entity_relationship .collect (
71+ id = cocoindex .GeneratedField .UUID ,
72+ subject = relationship ["subject" ],
73+ object = relationship ["object" ],
74+ predicate = relationship ["predicate" ],
75+ )
76+ # mention of an entity in a document, for subject
77+ entity_mention .collect (
78+ id = cocoindex .GeneratedField .UUID , entity = relationship ["subject" ],
79+ filename = doc ["filename" ],
80+ )
81+ # mention of an entity in a document, for object
82+ entity_mention .collect (
83+ id = cocoindex .GeneratedField .UUID , entity = relationship ["object" ],
84+ filename = doc ["filename" ],
85+ )
86+
9587
88+ # export to neo4j
9689 document_node .export (
9790 "document_node" ,
9891 cocoindex .storages .Neo4j (
9992 connection = conn_spec ,
10093 mapping = cocoindex .storages .NodeMapping (label = "Document" )),
10194 primary_key_fields = ["filename" ],
10295 )
96+ # Declare the referenced "Entity" node label that relationships point to
10397 flow_builder .declare (
10498 cocoindex .storages .Neo4jDeclarations (
10599 connection = conn_spec ,
106100 referenced_nodes = [
107101 cocoindex .storages .ReferencedNode (
108102 label = "Entity" ,
109103 primary_key_fields = ["value" ],
110- vector_indexes = [
111- cocoindex .VectorIndexDef (
112- field_name = "embedding" ,
113- metric = cocoindex .VectorSimilarityMetric .COSINE_SIMILARITY ,
114- ),
115- ],
116104 )
117105 ]
118106 )
@@ -128,17 +116,13 @@ def docs_to_kg_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.D
128116 fields = [
129117 cocoindex .storages .TargetFieldMapping (
130118 source = "subject" , target = "value" ),
131- cocoindex .storages .TargetFieldMapping (
132- source = "subject_embedding" , target = "embedding" ),
133119 ]
134120 ),
135121 target = cocoindex .storages .NodeReferenceMapping (
136122 label = "Entity" ,
137123 fields = [
138124 cocoindex .storages .TargetFieldMapping (
139125 source = "object" , target = "value" ),
140- cocoindex .storages .TargetFieldMapping (
141- source = "object_embedding" , target = "embedding" ),
142126 ]
143127 ),
144128 ),
0 commit comments