Skip to content

Commit fb34c3e

Browse files
authored
add chunk index to better find neighbors (#6)
1 parent b4a2b7f commit fb34c3e

File tree

1 file changed

+41
-37
lines changed

1 file changed

+41
-37
lines changed

loaders/text.py

Lines changed: 41 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -39,62 +39,66 @@ def __init__(self, config: Config):
3939
)
4040

4141
def load(self, paths: List[Path]) -> List[Document]:
    """Load *paths* into grouped, index-annotated chunks.

    Pipeline: partition each file (unstructured, ``strategy="fast"``) →
    greedily group element texts up to ``config.chunk_size`` characters,
    one ``## <filename>`` heading per chunk → optionally re-split
    ultra-long groups (> 2x chunk_size) with ``self.splitter`` →
    annotate every Document with ``chunk_id`` / ``chunk_total`` metadata
    so neighbouring chunks of the same source can be located later.

    Files that fail to partition are skipped with a warning rather than
    aborting the whole batch.

    NOTE(review): unlike the previous revision, exact-duplicate element
    texts are no longer filtered out — confirm this was intentional.
    """
    grouped: list[Document] = []

    for path in paths:
        try:
            logger.info("Partitioning %s", path)
            elements = partition(filename=str(path), strategy="fast")

            buf: list[str] = []
            buf_len = 0  # heading lines are deliberately not counted
            chunk_idx = 0
            fname = Path(path).name

            def _flush() -> None:
                """Emit the current buffer as one Document, then reset."""
                nonlocal buf, buf_len, chunk_idx
                if not buf_len:
                    return  # nothing but (at most) a heading — skip
                grouped.append(
                    Document(
                        page_content="\n".join(buf).strip(),
                        metadata={
                            "source": str(path),
                            "chunk_id": chunk_idx,
                        },
                    )
                )
                buf, buf_len = [], 0
                chunk_idx += 1

            for el in elements:
                txt = getattr(el, "text", "").strip()
                if not txt:
                    continue
                if buf_len == 0:
                    buf.append(f"## {fname}\n")  # one heading per chunk
                # Flush if adding this element would exceed chunk_size.
                if buf_len + len(txt) > self.config.chunk_size:
                    _flush()
                    # BUG FIX: re-seed the heading after an in-loop flush;
                    # otherwise every chunk after the first one would lack
                    # its filename heading (the buf_len == 0 check above
                    # never fires again within this iteration).
                    buf.append(f"## {fname}\n")
                buf.append(txt)
                buf_len += len(txt)
            _flush()  # flush the remainder for this file

        except Exception as e:
            # Best-effort: log and continue with the remaining paths.
            logger.warning("Failed to load %s: %s", path, e)

    # Secondary split for ultra-long groups (> 2x chunk_size). Split
    # pieces inherit the parent's metadata, including its chunk_id.
    final_docs: list[Document] = []
    for doc in grouped:
        if len(doc.page_content) > self.config.chunk_size * 2:
            final_docs.extend(self.splitter.split_documents([doc]))
        else:
            final_docs.append(doc)

    # Annotate chunk_total: how many chunks each source file produced.
    counts: dict[str, int] = {}
    for d in final_docs:
        counts[d.metadata["source"]] = counts.get(d.metadata["source"], 0) + 1
    for d in final_docs:
        d.metadata["chunk_total"] = counts[d.metadata["source"]]

    logger.info(
        "Produced %d chunks (avg %.0f chars)",
        len(final_docs),
        sum(len(d.page_content) for d in final_docs) / max(1, len(final_docs)),
    )
    return final_docs

0 commit comments

Comments
 (0)