@@ -111,6 +111,28 @@ def run( # type: ignore
111111
112112 return output_filepath
113113
114+ def _add_entities (self , element : dict , graph : "Graph" , element_node : _Node ) -> None :
115+ entities = element .get ("metadata" , {}).get ("entities" , [])
116+ if not entities :
117+ return None
118+ if not isinstance (entities , list ):
119+ return None
120+
121+ for entity in entities :
122+ if not isinstance (entity , dict ):
123+ continue
124+ if "entity" not in entity or "type" not in entity :
125+ continue
126+ entity_node = _Node (
127+ labels = [Label .ENTITY ], properties = {"id" : entity ["entity" ]}, id_ = entity ["entity" ]
128+ )
129+ graph .add_edge (
130+ entity_node ,
131+ _Node (labels = [Label .ENTITY ], properties = {"id" : entity ["type" ]}, id_ = entity ["type" ]),
132+ relationship = Relationship .ENTITY_TYPE ,
133+ )
134+ graph .add_edge (element_node , entity_node , relationship = Relationship .HAS_ENTITY )
135+
114136 def _create_lexical_graph (self , elements : list [dict ], document_node : _Node ) -> "Graph" :
115137 import networkx as nx
116138
@@ -129,25 +151,23 @@ def _create_lexical_graph(self, elements: list[dict], document_node: _Node) -> "
129151 previous_node = element_node
130152 graph .add_edge (element_node , document_node , relationship = Relationship .PART_OF_DOCUMENT )
131153
154+ self ._add_entities (element , graph , element_node )
155+
132156 if self ._is_chunk (element ):
133- origin_element_nodes = [
134- self ._create_element_node (origin_element )
135- for origin_element in format_and_truncate_orig_elements (element )
136- ]
137- graph .add_edges_from (
138- [
139- (origin_element_node , element_node )
140- for origin_element_node in origin_element_nodes
141- ],
142- relationship = Relationship .PART_OF_CHUNK ,
143- )
144- graph .add_edges_from (
145- [
146- (origin_element_node , document_node )
147- for origin_element_node in origin_element_nodes
148- ],
149- relationship = Relationship .PART_OF_DOCUMENT ,
150- )
157+ for origin_element in format_and_truncate_orig_elements (element ):
158+ origin_element_node = self ._create_element_node (origin_element )
159+
160+ graph .add_edge (
161+ origin_element_node ,
162+ element_node ,
163+ relationship = Relationship .PART_OF_CHUNK ,
164+ )
165+ graph .add_edge (
166+ origin_element_node ,
167+ document_node ,
168+ relationship = Relationship .PART_OF_DOCUMENT ,
169+ )
170+ self ._add_entities (origin_element , graph , origin_element_node )
151171
152172 return graph
153173
@@ -231,13 +251,16 @@ class Label(Enum):
231251 UNSTRUCTURED_ELEMENT = "UnstructuredElement"
232252 CHUNK = "Chunk"
233253 DOCUMENT = "Document"
254+ ENTITY = "Entity"
234255
235256
class Relationship(Enum):
    """Edge types used when building the lexical graph.

    Each member's value is the relationship name as a plain string.
    """

    # Element (or origin element) node -> document node.
    PART_OF_DOCUMENT = "PART_OF_DOCUMENT"
    # Origin element node -> the chunk node it was merged into.
    PART_OF_CHUNK = "PART_OF_CHUNK"
    # NOTE(review): presumably links consecutive chunk nodes; usage is not
    # visible in this part of the file -- confirm.
    NEXT_CHUNK = "NEXT_CHUNK"
    # NOTE(review): presumably links consecutive element nodes; usage is not
    # visible in this part of the file -- confirm.
    NEXT_ELEMENT = "NEXT_ELEMENT"
    # Entity node -> node representing that entity's type.
    ENTITY_TYPE = "ENTITY_TYPE"
    # Element node -> entity node found in that element's metadata.
    HAS_ENTITY = "HAS_ENTITY"
241264
242265
243266class Neo4jUploaderConfig (UploaderConfig ):
0 commit comments