55from dotenv import load_dotenv
66import cocoindex
77
@dataclasses.dataclass
class DocumentSummary:
    """Describe a summary of a document."""
    # Passed as `output_type` to the ExtractByLlm step that summarizes each
    # document's content; the two fields are then collected into the
    # `document_node` collector alongside the filename.
    # NOTE(review): the docstring above is kept verbatim because it may be
    # forwarded to the LLM as part of the output schema -- confirm before
    # rewording it.

    # Title produced for the document by the summarization step.
    title: str
    # Summary text produced for the document by the summarization step.
    summary: str
813
914@dataclasses .dataclass
1015class Relationship :
@@ -31,13 +36,25 @@ def docs_to_kg_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.D
3136 cocoindex .sources .LocalFile (path = "../../docs/docs/core" ,
3237 included_patterns = ["*.md" , "*.mdx" ]))
3338
34- relationships = data_scope .add_collector ()
39+ document_node = data_scope .add_collector ()
40+ entity_relationship = data_scope .add_collector ()
41+ entity_mention = data_scope .add_collector ()
3542
3643 with data_scope ["documents" ].row () as doc :
3744 doc ["chunks" ] = doc ["content" ].transform (
3845 cocoindex .functions .SplitRecursively (),
3946 language = "markdown" , chunk_size = 10000 )
4047
48+ doc ["summary" ] = doc ["content" ].transform (
49+ cocoindex .functions .ExtractByLlm (
50+ llm_spec = cocoindex .LlmSpec (
51+ api_type = cocoindex .LlmApiType .OPENAI , model = "gpt-4o" ),
52+ output_type = DocumentSummary ,
53+ instruction = "Please summarize the content of the document." ))
54+ document_node .collect (
55+ filename = doc ["filename" ], title = doc ["summary" ]["title" ],
56+ summary = doc ["summary" ]["summary" ])
57+
4158 with doc ["chunks" ].row () as chunk :
4259 chunk ["relationships" ] = chunk ["text" ].transform (
4360 cocoindex .functions .ExtractByLlm (
@@ -59,17 +76,31 @@ def docs_to_kg_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.D
5976 relationship ["object_embedding" ] = relationship ["object" ].transform (
6077 cocoindex .functions .SentenceTransformerEmbed (
6178 model = "sentence-transformers/all-MiniLM-L6-v2" ))
62- relationships .collect (
79+ entity_relationship .collect (
6380 id = cocoindex .GeneratedField .UUID ,
6481 subject = relationship ["subject" ],
6582 subject_embedding = relationship ["subject_embedding" ],
6683 object = relationship ["object" ],
6784 object_embedding = relationship ["object_embedding" ],
6885 predicate = relationship ["predicate" ],
6986 )
70-
71- relationships .export (
72- "relationships" ,
87+ entity_mention .collect (
88+ id = cocoindex .GeneratedField .UUID , entity = relationship ["subject" ],
89+ filename = doc ["filename" ], location = chunk ["location" ],
90+ )
91+ entity_mention .collect (
92+ id = cocoindex .GeneratedField .UUID , entity = relationship ["object" ],
93+ filename = doc ["filename" ], location = chunk ["location" ],
94+ )
95+ document_node .export (
96+ "document_node" ,
97+ cocoindex .storages .Neo4j (
98+ connection = conn_spec ,
99+ mapping = cocoindex .storages .Neo4jNode (label = "Document" )),
100+ primary_key_fields = ["filename" ],
101+ )
102+ entity_relationship .export (
103+ "entity_relationship" ,
73104 cocoindex .storages .Neo4j (
74105 connection = conn_spec ,
75106 mapping = cocoindex .storages .Neo4jRelationship (
@@ -107,6 +138,25 @@ def docs_to_kg_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.D
107138 ),
108139 primary_key_fields = ["id" ],
109140 )
141+ entity_mention .export (
142+ "entity_mention" ,
143+ cocoindex .storages .Neo4j (
144+ connection = conn_spec ,
145+ mapping = cocoindex .storages .Neo4jRelationship (
146+ rel_type = "MENTION" ,
147+ source = cocoindex .storages .Neo4jRelationshipEnd (
148+ label = "Document" ,
149+ fields = [cocoindex .storages .Neo4jFieldMapping ("filename" )],
150+ ),
151+ target = cocoindex .storages .Neo4jRelationshipEnd (
152+ label = "Entity" ,
153+ fields = [cocoindex .storages .Neo4jFieldMapping (
154+ field_name = "entity" , node_field_name = "value" )],
155+ ),
156+ ),
157+ ),
158+ primary_key_fields = ["id" ],
159+ )
110160
111161@cocoindex .main_fn ()
112162def _run ():
0 commit comments