@@ -28,9 +28,7 @@ async def _embed_chunks(
2828) -> tuple [pd .DataFrame , np .ndarray ]:
2929 """Convert text chunks into dense text embeddings."""
3030 sampled_text_chunks = text_chunks .sample (n = min (n_subset_max , len (text_chunks )))
31- embeddings = await embedding_llm .aembed_batch (
32- sampled_text_chunks ["text" ].tolist ()
33- )
31+ embeddings = await embedding_llm .aembed_batch (sampled_text_chunks ["text" ].tolist ())
3432 return text_chunks , np .array (embeddings )
3533
3634
@@ -66,20 +64,20 @@ async def load_docs_in_chunks(
6664 dataset = await create_input (config .input , logger , root )
6765 chunk_config = config .chunks
6866 chunks_df = create_base_text_units (
69- documents = dataset ,
70- callbacks = NoopWorkflowCallbacks (),
71- group_by_columns = chunk_config .group_by_columns ,
72- size = chunk_size ,
73- overlap = overlap ,
74- encoding_model = chunk_config .encoding_model ,
75- strategy = chunk_config .strategy ,
76- prepend_metadata = chunk_config .prepend_metadata ,
77- chunk_size_includes_metadata = chunk_config .chunk_size_includes_metadata ,
67+ documents = dataset ,
68+ callbacks = NoopWorkflowCallbacks (),
69+ group_by_columns = chunk_config .group_by_columns ,
70+ size = chunk_size ,
71+ overlap = overlap ,
72+ encoding_model = chunk_config .encoding_model ,
73+ strategy = chunk_config .strategy ,
74+ prepend_metadata = chunk_config .prepend_metadata ,
75+ chunk_size_includes_metadata = chunk_config .chunk_size_includes_metadata ,
7876 )
7977
8078 # Depending on the select method, build the dataset
8179 if limit <= 0 or limit > len (chunks_df ):
82- logger .warning (f"Limit out of range, using default number of chunks: { LIMIT } " ) # noqa: G004
80+ logger .warning (f"Limit out of range, using default number of chunks: { LIMIT } " ) # noqa: G004
8381 limit = LIMIT
8482
8583 if select_method == DocSelectionType .TOP :
0 commit comments