@@ -39,58 +39,62 @@ def __init__(self, config: Config):
         )
 
     def load(self, paths: List[Path]) -> List[Document]:
+        """Load files → Unstructured elements → grouped chunks."""
         all_chunks: List[Document] = []
 
         for path in paths:
             try:
                 logger.info("Partitioning %s", path)
                 elements = partition(filename=str(path), strategy="fast")
 
-                # 1) concatenate elements until we hit ~chunk_size chars
                 buf: List[str] = []
+                seen: set[str] = set()
                 buf_len = 0
+                fname = Path(path).name
+
                 for el in elements:
-                    if not getattr(el, "text", ""):
-                        continue
-                    t = el.text.strip()
-                    if not t:
-                        continue
+                    text = getattr(el, "text", "").strip()
+                    if not text or text in seen:
+                        continue  # skip blanks & exact duplicates
+                    seen.add(text)
+
+                    # If starting a new group, add a tiny heading once
+                    if buf_len == 0:
+                        buf.append(f"## {fname}\n")
 
-                    if buf_len + len(t) > self.config.chunk_size and buf:
+                    # Flush if adding this element would exceed chunk_size
+                    if buf_len + len(text) > self.config.chunk_size and buf:
                         all_chunks.append(
                             Document(
-                                page_content="\n".join(buf),
+                                page_content="\n".join(buf).strip(),
                                 metadata={"source": str(path)},
                             )
                         )
-                        buf, buf_len = [], 0
+                        buf, seen, buf_len = [f"## {fname}\n"], set(), 0
 
-                    buf.append(t)
-                    buf_len += len(t)
+                    buf.append(text)
+                    buf_len += len(text)
 
-                # flush remainder
-                if buf:
+                # flush remainder
+                if buf_len:
                     all_chunks.append(
                         Document(
-                            page_content="\n".join(buf),
+                            page_content="\n".join(buf).strip(),
                             metadata={"source": str(path)},
                         )
                     )
 
             except Exception as e:
                 logger.warning("Failed to load %s: %s", path, e)
 
-        # 2) optional secondary splitter for *very* long docs
+        # 2) secondary split for *very* large groups
        final_docs: List[Document] = []
        for doc in all_chunks:
            if len(doc.page_content) > self.config.chunk_size * 2:
                final_docs.extend(self.splitter.split_documents([doc]))
            else:
                final_docs.append(doc)

-        logger.info(
-            "Produced %d chunks (avg %.0f chars)",
-            len(final_docs),
-            sum(len(d.page_content) for d in final_docs) / max(1, len(final_docs)),
-        )
+        avg = sum(len(d.page_content) for d in final_docs) / max(1, len(final_docs))
+        logger.info("Produced %d chunks (avg %.0f chars)", len(final_docs), avg)
         return final_docs
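
For reference, a minimal usage sketch of the new grouping behavior under stated assumptions: the DocumentLoader class name and the Config(chunk_size=...) constructor are guesses here, since only the load() method appears in the hunk above.

# Hypothetical usage sketch -- loader class name and Config signature are assumed.
from pathlib import Path

config = Config(chunk_size=1000)        # chunk_size drives when a group is flushed
loader = DocumentLoader(config)         # assumed class that owns load() and self.splitter
docs = loader.load([Path("report.pdf"), Path("notes.md")])

for doc in docs:
    # each chunk opens with a "## <filename>" heading and records its source path
    print(doc.metadata["source"], len(doc.page_content))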