Skip to content

Commit 8feb3e0

Browse files
committed
try deduping chunks at ingestion
1 parent 0fc2432 commit 8feb3e0

File tree

1 file changed

+24
-20
lines changed

1 file changed

+24
-20
lines changed

loaders/text.py

Lines changed: 24 additions & 20 deletions
Original file line number | Diff line number | Diff line change
@@ -39,58 +39,62 @@ def __init__(self, config: Config):
3939
)
4040

4141
def load(self, paths: List[Path]) -> List[Document]:
    """Load files -> Unstructured elements -> grouped, deduplicated chunks.

    Each file is partitioned with ``unstructured``; element texts are
    concatenated into groups of roughly ``config.chunk_size`` characters,
    each group prefixed with a small ``## <filename>`` heading. Exact
    duplicate element texts are skipped across the *whole* file (not just
    within one chunk). Groups longer than twice ``chunk_size`` are re-split
    with the secondary splitter.

    Args:
        paths: Files to ingest.

    Returns:
        List of ``Document`` chunks, each carrying ``{"source": <path>}``
        metadata. Files that fail to partition are logged and skipped.
    """
    all_chunks: List[Document] = []

    for path in paths:
        try:
            logger.info("Partitioning %s", path)
            elements = partition(filename=str(path), strategy="fast")

            fname = Path(path).name
            heading = f"## {fname}\n"
            buf: List[str] = []
            buf_len = 0
            # Dedupe across the entire file: resetting `seen` on each
            # flush (as before) would let duplicates reappear in later
            # chunks, defeating ingestion-time dedup.
            seen: set[str] = set()

            for el in elements:
                text = getattr(el, "text", "").strip()
                if not text or text in seen:
                    continue  # skip blanks & exact duplicates
                seen.add(text)

                # Starting a new group: add the heading once and count
                # its length toward the budget (previously uncounted,
                # so chunks overshot chunk_size).
                if not buf:
                    buf.append(heading)
                    buf_len = len(heading)

                # Flush if adding this element would exceed chunk_size.
                # `len(buf) > 1` ensures we never emit a heading-only
                # chunk when a single element is itself oversized.
                if buf_len + len(text) > self.config.chunk_size and len(buf) > 1:
                    all_chunks.append(
                        Document(
                            page_content="\n".join(buf).strip(),
                            metadata={"source": str(path)},
                        )
                    )
                    buf = [heading]
                    buf_len = len(heading)

                buf.append(text)
                buf_len += len(text)

            # Flush the remainder (anything beyond the bare heading).
            if len(buf) > 1:
                all_chunks.append(
                    Document(
                        page_content="\n".join(buf).strip(),
                        metadata={"source": str(path)},
                    )
                )

        except Exception as e:
            # Best-effort ingestion: a single bad file must not abort
            # the whole batch.
            logger.warning("Failed to load %s: %s", path, e)

    # 2) secondary split for *very* large groups
    final_docs: List[Document] = []
    for doc in all_chunks:
        if len(doc.page_content) > self.config.chunk_size * 2:
            final_docs.extend(self.splitter.split_documents([doc]))
        else:
            final_docs.append(doc)

    avg = sum(len(d.page_content) for d in final_docs) / max(1, len(final_docs))
    logger.info("Produced %d chunks (avg %.0f chars)", len(final_docs), avg)
    return final_docs

0 commit comments

Comments
 (0)