Skip to content

Commit 0fc2432

Browse files
authored
combine partitions inside of chunk for more complete context in each chunk (#4)
1 parent 478e34b commit 0fc2432

File tree

1 file changed

+44
-24
lines changed

1 file changed

+44
-24
lines changed

loaders/text.py

Lines changed: 44 additions & 24 deletions

Original file line number | Diff line number | Diff line change
@@ -39,38 +39,58 @@ def __init__(self, config: Config):
3939
)
4040

4141
def load(self, paths: List[Path]) -> List[Document]:
    """
    Load text files and combine partitioned elements into chunked Documents.

    Element texts from each file are concatenated (joined with newlines)
    until the accumulated chunk reaches roughly ``self.config.chunk_size``
    characters, so each chunk carries more complete context than a single
    element would. Chunks that still exceed twice ``chunk_size`` (e.g. one
    huge element) are re-split with ``self.splitter``.

    Args:
        paths (List[Path]): List of file paths to load.

    Returns:
        List[Document]: Chunked Document objects with ``source`` metadata.
    """
    all_chunks: List[Document] = []

    for path in paths:
        try:
            logger.info("Partitioning %s", path)
            elements = partition(filename=str(path), strategy="fast")

            # 1) concatenate element texts until we hit ~chunk_size chars
            buf: List[str] = []
            buf_len = 0  # length of "\n".join(buf), separators included
            for el in elements:
                # Single pass: missing attribute, None, and
                # whitespace-only text all reduce to an empty string.
                t = (getattr(el, "text", "") or "").strip()
                if not t:
                    continue

                # sep accounts for the "\n" the join will insert, so we
                # compare the *actual* emitted chunk length to chunk_size.
                sep = 1 if buf else 0
                if buf and buf_len + sep + len(t) > self.config.chunk_size:
                    all_chunks.append(
                        Document(
                            page_content="\n".join(buf),
                            metadata={"source": str(path)},
                        )
                    )
                    buf, buf_len = [], 0
                    sep = 0  # new buffer: no separator before first element

                buf.append(t)
                buf_len += sep + len(t)

            # flush the remainder for this file
            if buf:
                all_chunks.append(
                    Document(
                        page_content="\n".join(buf),
                        metadata={"source": str(path)},
                    )
                )

        except Exception as e:
            # Best-effort batch loading: an unreadable file is skipped
            # with a warning rather than aborting the whole run.
            logger.warning("Failed to load %s: %s", path, e)

    # 2) optional secondary splitter for *very* long docs
    final_docs: List[Document] = []
    for doc in all_chunks:
        if len(doc.page_content) > self.config.chunk_size * 2:
            final_docs.extend(self.splitter.split_documents([doc]))
        else:
            final_docs.append(doc)

    logger.info(
        "Produced %d chunks (avg %.0f chars)",
        len(final_docs),
        sum(len(d.page_content) for d in final_docs) / max(1, len(final_docs)),
    )
    return final_docs

0 commit comments

Comments (0)