@@ -39,62 +39,66 @@ def __init__(self, config: Config):
3939 )
4040
def load(self, paths: List[Path]) -> List[Document]:
    """Partition files, group elements into chunks, then post-process.

    Pipeline: Unstructured ``partition`` -> greedy size-bounded grouping
    (one ``## <filename>`` heading per chunk) -> optional secondary split
    for ultra-long groups -> per-file ``chunk_total`` annotation.

    Args:
        paths: Files to load. Files that fail to partition are logged
            and skipped (best-effort, no exception propagates).

    Returns:
        Documents with metadata ``source`` (file path), ``chunk_id``
        (0-based index within the file), and ``chunk_total`` (number of
        final chunks for that file).
    """
    grouped: list[Document] = []

    for path in paths:
        try:
            logger.info("Partitioning %s", path)
            elements = partition(filename=str(path), strategy="fast")

            buf, buf_len, chunk_idx = [], 0, 0
            fname = Path(path).name

            def _flush():
                # Emit the accumulated buffer as one Document.
                # buf_len counts only element text (not the heading),
                # so a buffer holding just a heading is not emitted.
                nonlocal buf, buf_len, chunk_idx
                if not buf_len:
                    return
                grouped.append(
                    Document(
                        page_content="\n".join(buf).strip(),
                        metadata={
                            "source": str(path),
                            "chunk_id": chunk_idx,
                        },
                    )
                )
                buf, buf_len = [], 0
                chunk_idx += 1

            for el in elements:
                txt = getattr(el, "text", "").strip()
                if not txt:
                    continue
                # Flush first, THEN seed the heading: this way every
                # chunk (not just the first per file) starts with the
                # filename heading.
                if buf_len + len(txt) > self.config.chunk_size:
                    _flush()
                if not buf:
                    buf.append(f"## {fname}\n")  # one heading per chunk
                buf.append(txt)
                buf_len += len(txt)
            _flush()

        except Exception as e:
            # Best-effort loading: a bad file must not abort the batch.
            logger.warning("Failed to load %s: %s", path, e)

    # Optional secondary split for ultra-long groups (> 2x chunk_size).
    # NOTE(review): splitter-produced pieces keep the parent's chunk_id;
    # per-file ids may repeat after a split — confirm downstream is OK.
    final_docs = []
    for doc in grouped:
        if len(doc.page_content) > self.config.chunk_size * 2:
            final_docs.extend(self.splitter.split_documents([doc]))
        else:
            final_docs.append(doc)

    # Annotate chunk_total: how many final chunks each source produced.
    counts: dict[str, int] = {}
    for d in final_docs:
        counts[d.metadata["source"]] = counts.get(d.metadata["source"], 0) + 1
    for d in final_docs:
        d.metadata["chunk_total"] = counts[d.metadata["source"]]

    logger.info(
        "Produced %d chunks (avg %.0f chars)",
        len(final_docs),
        sum(len(d.page_content) for d in final_docs) / max(1, len(final_docs)),
    )
    return final_docs
0 commit comments