     Tokenizer,
 )
 from graphgen.operators import (
-    build_mm_kg,
-    build_text_kg,
+    build_kg,
     chunk_documents,
     generate_qas,
     init_llm,
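The import change collapses the two modality-specific builders, build_text_kg and build_mm_kg, into a single build_kg entry point. The diff does not show its body; presumably it dispatches on each chunk's type field, since the old text path constructed Chunk(..., type="text") explicitly. A minimal, self-contained sketch of such a dispatcher, assuming that design; the helper names and simplified signature below are illustrative, not taken from graphgen:

# Hypothetical sketch: a unified build_kg that routes chunks by type.
# _extract_text_kg / _extract_mm_kg are stand-ins, not graphgen APIs.
import asyncio
from dataclasses import dataclass

@dataclass
class Chunk:
    id: str
    content: str
    type: str = "text"

async def _extract_text_kg(chunks):
    # Stand-in for the text entity/relation extraction pipeline.
    return [("text-entity", c.id) for c in chunks]

async def _extract_mm_kg(chunks):
    # Stand-in for the multi-modal extraction pipeline.
    return [("mm-entity", c.id) for c in chunks]

async def build_kg(llm_client=None, kg_instance=None, chunks=(), progress_bar=None):
    # Split the incoming chunks by modality and run both extractors.
    text = [c for c in chunks if c.type == "text"]
    mm = [c for c in chunks if c.type != "text"]
    results = []
    if text:
        results += await _extract_text_kg(text)
    if mm:
        results += await _extract_mm_kg(mm)
    return results

print(asyncio.run(build_kg(chunks=[Chunk("c1", "hi"), Chunk("c2", "img", type="image")])))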
@@ -96,109 +95,45 @@ async def insert(self, read_config: Dict, split_config: Dict): |
         new_docs = {compute_mm_hash(doc, prefix="doc-"): doc for doc in data}
         _add_doc_keys = await self.full_docs_storage.filter_keys(list(new_docs.keys()))
         new_docs = {k: v for k, v in new_docs.items() if k in _add_doc_keys}
-        new_text_docs = {k: v for k, v in new_docs.items() if v.get("type") == "text"}
-        new_mm_docs = {k: v for k, v in new_docs.items() if v.get("type") != "text"}
-
-        await self.full_docs_storage.upsert(new_docs)
-
-        async def _insert_text_docs(text_docs):
-            if len(text_docs) == 0:
-                logger.warning("All text docs are already in the storage")
-                return
-            logger.info("[New Docs] inserting %d text docs", len(text_docs))
-            # Step 2.1: Split chunks and filter existing ones
-            inserting_chunks = await chunk_documents(
-                text_docs,
-                split_config["chunk_size"],
-                split_config["chunk_overlap"],
-                self.tokenizer_instance,
-                self.progress_bar,
-            )

-            _add_chunk_keys = await self.chunks_storage.filter_keys(
-                list(inserting_chunks.keys())
-            )
-            inserting_chunks = {
-                k: v for k, v in inserting_chunks.items() if k in _add_chunk_keys
-            }
-
-            if len(inserting_chunks) == 0:
-                logger.warning("All text chunks are already in the storage")
-                return
-
-            logger.info("[New Chunks] inserting %d text chunks", len(inserting_chunks))
-            await self.chunks_storage.upsert(inserting_chunks)
-
-            # Step 2.2: Extract entities and relations from text chunks
-            logger.info("[Text Entity and Relation Extraction] processing ...")
-            _add_entities_and_relations = await build_text_kg(
-                llm_client=self.synthesizer_llm_client,
-                kg_instance=self.graph_storage,
-                chunks=[
-                    Chunk(id=k, content=v["content"], type="text")
-                    for k, v in inserting_chunks.items()
-                ],
-                progress_bar=self.progress_bar,
-            )
-            if not _add_entities_and_relations:
-                logger.warning("No entities or relations extracted from text chunks")
-                return
-
-            await self._insert_done()
-            return _add_entities_and_relations
-
-        async def _insert_multi_modal_docs(mm_docs):
-            if len(mm_docs) == 0:
-                logger.warning("No multi-modal documents to insert")
-                return
-
-            logger.info("[New Docs] inserting %d multi-modal docs", len(mm_docs))
-
-            # Step 3.1: Transform multi-modal documents into chunks and filter existing ones
-            inserting_chunks = await chunk_documents(
-                mm_docs,
-                split_config["chunk_size"],
-                split_config["chunk_overlap"],
-                self.tokenizer_instance,
-                self.progress_bar,
-            )
+        if len(new_docs) == 0:
+            logger.warning("All documents are already in the storage")
+            return

-            _add_chunk_keys = await self.chunks_storage.filter_keys(
-                list(inserting_chunks.keys())
-            )
-            inserting_chunks = {
-                k: v for k, v in inserting_chunks.items() if k in _add_chunk_keys
-            }
+        inserting_chunks = await chunk_documents(
+            new_docs,
+            split_config["chunk_size"],
+            split_config["chunk_overlap"],
+            self.tokenizer_instance,
+            self.progress_bar,
+        )

-            if len(inserting_chunks) == 0:
-                logger.warning("All multi-modal chunks are already in the storage")
-                return
+        _add_chunk_keys = await self.chunks_storage.filter_keys(
+            list(inserting_chunks.keys())
+        )
+        inserting_chunks = {
+            k: v for k, v in inserting_chunks.items() if k in _add_chunk_keys
+        }

-            logger.info(
-                "[New Chunks] inserting %d multimodal chunks", len(inserting_chunks)
-            )
-            await self.chunks_storage.upsert(inserting_chunks)
-
-            # Step 3.2: Extract multi-modal entities and relations from chunks
-            logger.info("[Multi-modal Entity and Relation Extraction] processing ...")
-            _add_entities_and_relations = await build_mm_kg(
-                llm_client=self.synthesizer_llm_client,
-                kg_instance=self.graph_storage,
-                chunks=[Chunk.from_dict(k, v) for k, v in inserting_chunks.items()],
-                progress_bar=self.progress_bar,
-            )
-            if not _add_entities_and_relations:
-                logger.warning(
-                    "No entities or relations extracted from multi-modal chunks"
-                )
-                return
-            await self._insert_done()
-            return _add_entities_and_relations
-
-        # Step 2: Insert text documents
-        await _insert_text_docs(new_text_docs)
-        # Step 3: Insert multi-modal documents
-        await _insert_multi_modal_docs(new_mm_docs)
+        if len(inserting_chunks) == 0:
+            logger.warning("All chunks are already in the storage")
+            return
+
+        logger.info("[New Chunks] inserting %d chunks", len(inserting_chunks))
+        await self.chunks_storage.upsert(inserting_chunks)
+
+        _add_entities_and_relations = await build_kg(
+            llm_client=self.synthesizer_llm_client,
+            kg_instance=self.graph_storage,
+            chunks=[Chunk.from_dict(k, v) for k, v in inserting_chunks.items()],
+            progress_bar=self.progress_bar,
+        )
+        if not _add_entities_and_relations:
+            logger.warning("No entities or relations extracted from chunks")
+            return
+
+        await self._insert_done()
+        return _add_entities_and_relations

     async def _insert_done(self):
         tasks = []
|
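Both before and after this rewrite, the insert path relies on a two-level idempotency guard: documents are keyed by content hash and filter_keys drops anything already stored, then the identical filter runs again at chunk level before upsert, so re-running insert over the same corpus is a no-op. A self-contained sketch of that pattern; InMemoryKV and compute_hash are toy stand-ins, and only the filter_keys/upsert shape mirrors the diff:

# Minimal sketch of the filter-then-upsert idempotency pattern above.
import asyncio
from hashlib import md5

class InMemoryKV:
    """Toy key-value store standing in for the real storage backends."""
    def __init__(self):
        self._data = {}

    async def filter_keys(self, keys):
        # Return only the keys that are not yet stored (mirrors the diff's usage).
        return {k for k in keys if k not in self._data}

    async def upsert(self, items):
        self._data.update(items)

def compute_hash(doc, prefix=""):
    # Stand-in for compute_mm_hash: a stable, content-derived key.
    return prefix + md5(repr(sorted(doc.items())).encode()).hexdigest()

async def insert(storage, docs):
    new_docs = {compute_hash(d, prefix="doc-"): d for d in docs}
    fresh = await storage.filter_keys(list(new_docs.keys()))
    new_docs = {k: v for k, v in new_docs.items() if k in fresh}
    if not new_docs:
        return None  # everything already stored; re-running is a no-op
    await storage.upsert(new_docs)
    return new_docs

storage = InMemoryKV()
docs = [{"type": "text", "content": "hello world"}]
print(asyncio.run(insert(storage, docs)))  # inserts one doc
print(asyncio.run(insert(storage, docs)))  # None: filtered as a duplicate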