@@ -39,38 +39,58 @@ def __init__(self, config: Config):
3939 )
4040
def load(self, paths: List[Path]) -> List[Document]:
    """
    Partition files and pack their text into roughly chunk-sized documents.

    Each path is passed to ``partition(..., strategy="fast")``. The stripped,
    non-empty element texts are joined with newlines into chunks whose joined
    length stays within ``self.config.chunk_size``; a single element longer
    than that is kept whole in its own chunk. Any chunk that still exceeds
    twice ``chunk_size`` is handed to ``self.splitter`` for a secondary split.

    A file that raises during loading is logged as a warning and skipped, so
    one bad file does not abort the batch.

    Args:
        paths (List[Path]): List of file paths to load.

    Returns:
        List[Document]: Chunked documents whose metadata contains only the
            ``"source"`` path (plus whatever ``self.splitter`` adds when the
            secondary split is applied).
    """
    separator = "\n"
    limit = self.config.chunk_size
    all_chunks: List[Document] = []

    for path in paths:
        try:
            logger.info("Partitioning %s", path)
            elements = partition(filename=str(path), strategy="fast")
            source = str(path)

            # 1) Pack consecutive elements until the joined text would
            #    exceed chunk_size.
            buf: List[str] = []
            buf_len = 0  # length of separator.join(buf), separators included
            for el in elements:
                text = (getattr(el, "text", "") or "").strip()
                if not text:
                    continue

                # Count the separator as well: otherwise many short elements
                # produce a chunk far larger than chunk_size that still slips
                # under the secondary-split threshold below.
                if buf and buf_len + len(separator) + len(text) > limit:
                    all_chunks.append(
                        Document(
                            page_content=separator.join(buf),
                            metadata={"source": source},
                        )
                    )
                    buf, buf_len = [], 0

                buf_len += len(text) + (len(separator) if buf else 0)
                buf.append(text)

            # Flush whatever is left for this file.
            if buf:
                all_chunks.append(
                    Document(
                        page_content=separator.join(buf),
                        metadata={"source": source},
                    )
                )

        except Exception as e:
            # Best-effort batch load: report the failure and move on.
            logger.warning("Failed to load %s: %s", path, e)

    # 2) Secondary splitter, only for documents that are *very* long
    #    (in practice a single oversized element).
    final_docs: List[Document] = []
    for doc in all_chunks:
        if len(doc.page_content) > limit * 2:
            final_docs.extend(self.splitter.split_documents([doc]))
        else:
            final_docs.append(doc)

    logger.info(
        "Produced %d chunks (avg %.0f chars)",
        len(final_docs),
        # max(1, ...) avoids division by zero when nothing was loaded.
        sum(len(d.page_content) for d in final_docs) / max(1, len(final_docs)),
    )
    return final_docs
0 commit comments