@@ -50,6 +50,7 @@ def __call__(self, batch: pd.DataFrame) -> pd.DataFrame:
         return pd.DataFrame(self.chunk_documents(docs))
 
     def chunk_documents(self, new_docs: list) -> list:
+        chunks = []
         for doc in new_docs:
             doc_id = doc.get("_doc_id")
             doc_type = doc.get("type")
@@ -62,24 +63,29 @@ def chunk_documents(self, new_docs: list) -> list:
                     **self.chunk_kwargs,
                 )
 
-                return [
+                chunks.extend(
+                    [
+                        {
+                            "_chunk_id": compute_content_hash(
+                                chunk_text, prefix="chunk-"
+                            ),
+                            "content": chunk_text,
+                            "type": "text",
+                            "_doc_id": doc_id,
+                            "length": len(self.tokenizer_instance.encode(chunk_text))
+                            if self.tokenizer_instance
+                            else len(chunk_text),
+                            "language": doc_language,
+                        }
+                        for chunk_text in text_chunks
+                    ]
+                )
+            else:
+                # other types of documents (images, sequences) are not chunked
+                chunks.append(
                     {
-                        "_chunk_id": compute_content_hash(chunk_text, prefix="chunk-"),
-                        "content": chunk_text,
-                        "type": "text",
-                        "_doc_id": doc_id,
-                        "length": len(self.tokenizer_instance.encode(chunk_text))
-                        if self.tokenizer_instance
-                        else len(chunk_text),
-                        "language": doc_language,
+                        "_chunk_id": doc_id.replace("doc-", f"{doc_type}-"),
+                        **doc,
                     }
-                    for chunk_text in text_chunks
-                ]
-
-        # other types of documents(images, sequences) are not chunked
-        return [
-            {
-                "_chunk_id": doc_id.replace("doc-", f"{doc_type}-"),
-                **doc,
-            }
-        ]
+                )
+        return chunks
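
The change above replaces two early returns with a single accumulator: the old code returned from inside the loop as soon as it chunked the first text document, silently dropping every document after it, and its trailing fallback only ever saw the last document. The new code collects chunks for every document and returns them together. Below is a minimal, self-contained sketch of the fixed control flow; the fixed-width splitter and the compute_content_hash stand-in are simplified placeholders for the repository's actual chunker, tokenizer, and hashing helper.

import hashlib


def compute_content_hash(text: str, prefix: str = "") -> str:
    # Hypothetical stand-in for the repo's compute_content_hash helper.
    return prefix + hashlib.sha256(text.encode("utf-8")).hexdigest()[:12]


def chunk_documents(new_docs: list) -> list:
    # Accumulate chunks across ALL documents instead of returning from
    # inside the loop, which would drop every document after the first.
    chunks = []
    for doc in new_docs:
        doc_id = doc.get("_doc_id")
        doc_type = doc.get("type")
        if doc_type == "text":
            # Fixed-width splitter stands in for the real chunking function.
            text = doc["content"]
            text_chunks = [text[i : i + 20] for i in range(0, len(text), 20)]
            chunks.extend(
                {
                    "_chunk_id": compute_content_hash(c, prefix="chunk-"),
                    "content": c,
                    "type": "text",
                    "_doc_id": doc_id,
                    "length": len(c),
                }
                for c in text_chunks
            )
        else:
            # Non-text documents (images, sequences) pass through unchunked.
            chunks.append(
                {"_chunk_id": doc_id.replace("doc-", f"{doc_type}-"), **doc}
            )
    return chunks


docs = [
    {"_doc_id": "doc-1", "type": "text", "content": "x" * 45},
    {"_doc_id": "doc-2", "type": "image", "content": "<raw image bytes>"},
]
result = chunk_documents(docs)
print(len(result))  # 4 -> three text chunks plus one image passthrough
print(result[-1]["_chunk_id"])  # image-2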