
Commit 95c4783

fix: fix return logic
1 parent b458e48 commit 95c4783

File tree

1 file changed: +25 -19 lines changed

graphgen/operators/chunk/chunk_service.py

Lines changed: 25 additions & 19 deletions
@@ -50,6 +50,7 @@ def __call__(self, batch: pd.DataFrame) -> pd.DataFrame:
         return pd.DataFrame(self.chunk_documents(docs))
 
     def chunk_documents(self, new_docs: list) -> list:
+        chunks = []
         for doc in new_docs:
             doc_id = doc.get("_doc_id")
             doc_type = doc.get("type")
@@ -62,24 +63,29 @@ def chunk_documents(self, new_docs: list) -> list:
                     **self.chunk_kwargs,
                 )
 
-                return [
+                chunks.extend(
+                    [
+                        {
+                            "_chunk_id": compute_content_hash(
+                                chunk_text, prefix="chunk-"
+                            ),
+                            "content": chunk_text,
+                            "type": "text",
+                            "_doc_id": doc_id,
+                            "length": len(self.tokenizer_instance.encode(chunk_text))
+                            if self.tokenizer_instance
+                            else len(chunk_text),
+                            "language": doc_language,
+                        }
+                        for chunk_text in text_chunks
+                    ]
+                )
+            else:
+                # other types of documents(images, sequences) are not chunked
+                chunks.append(
                     {
-                        "_chunk_id": compute_content_hash(chunk_text, prefix="chunk-"),
-                        "content": chunk_text,
-                        "type": "text",
-                        "_doc_id": doc_id,
-                        "length": len(self.tokenizer_instance.encode(chunk_text))
-                        if self.tokenizer_instance
-                        else len(chunk_text),
-                        "language": doc_language,
+                        "_chunk_id": doc_id.replace("doc-", f"{doc_type}-"),
+                        **doc,
                     }
-                    for chunk_text in text_chunks
-                ]
-
-            # other types of documents(images, sequences) are not chunked
-            return [
-                {
-                    "_chunk_id": doc_id.replace("doc-", f"{doc_type}-"),
-                    **doc,
-                }
-            ]
+                )
+        return chunks
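
The old chunk_documents returned from inside the per-document loop, so a batch with more than one document only ever produced output for the first one; the fix collects chunks across all documents and returns once after the loop. The following is a minimal, self-contained sketch of that accumulate-then-return pattern, not the project's actual ChunkService: compute_content_hash is stubbed with an assumed md5-based implementation, split_text is a hypothetical stand-in for the real chunking call, and the tokenizer and language handling from the real class are omitted.

# Minimal sketch (not the project's actual ChunkService): accumulate chunks
# from every document and return once after the loop, instead of returning
# inside the loop, which only ever emitted output for the first document.
import hashlib


def compute_content_hash(text: str, prefix: str = "") -> str:
    # stand-in for graphgen's compute_content_hash; the exact hashing is assumed
    return prefix + hashlib.md5(text.encode("utf-8")).hexdigest()


def split_text(text: str, size: int = 32) -> list:
    # hypothetical splitter standing in for the real chunking call
    return [text[i : i + size] for i in range(0, len(text), size)]


def chunk_documents(new_docs: list) -> list:
    chunks = []  # collected across all documents
    for doc in new_docs:
        doc_id = doc.get("_doc_id")
        doc_type = doc.get("type")
        if doc_type == "text":
            chunks.extend(
                {
                    "_chunk_id": compute_content_hash(chunk_text, prefix="chunk-"),
                    "content": chunk_text,
                    "type": "text",
                    "_doc_id": doc_id,
                    "length": len(chunk_text),
                }
                for chunk_text in split_text(doc.get("content", ""))
            )
        else:
            # other document types (images, sequences) pass through unchanged
            chunks.append({"_chunk_id": doc_id.replace("doc-", f"{doc_type}-"), **doc})
    return chunks  # single return, after every document has been processed


if __name__ == "__main__":
    docs = [
        {"_doc_id": "doc-1", "type": "text", "content": "alpha " * 20},
        {"_doc_id": "doc-2", "type": "image", "path": "figure.png"},
        {"_doc_id": "doc-3", "type": "text", "content": "beta " * 20},
    ]
    out = chunk_documents(docs)
    # the pre-fix early return would have produced chunks for doc-1 only
    print(len(out), sorted({c["_doc_id"] for c in out}))

With the single return after the loop, the DataFrame built in __call__ from chunk_documents(docs) carries one row per chunk for the whole batch rather than only for the first document.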
