@@ -45,12 +45,13 @@ def __init__(
4545 self .docs = []
4646 loader_name = str (type (self .loader )).split ("." )[- 1 ].split ("'" )[0 ]
4747 self .source_type = get_loader_type (loader_name )
48- self .source_size = self .get_source_size (self .source_path )
48+ self .source_path_size = self .get_source_size (self .source_path )
49+ self .source_aggr_size = 0
4950 self .loader_details = {
5051 "loader" : loader_name ,
5152 "source_path" : self .source_path ,
5253 "source_type" : self .source_type ,
53- "source_size " : self .source_size ,
54+ "source_path_size " : self .source_path_size ,
5455 }
5556 # generate app
5657 self .app = self ._get_app_details ()
@@ -98,14 +99,16 @@ def _send_loader_doc(self, loading_end=False):
9899 for doc in doc_content :
99100 doc_source_path = get_full_path (doc .get ("metadata" , {}).get ("source" ))
100101 doc_source_owner = PebbloSafeLoader .get_file_owner_from_path (doc_source_path )
101- doc_source_size = self .get_source_size (doc_source_path )
102+ page_content = doc .get ("page_content" )
103+ doc_source_size = self .calculate_content_size (page_content )
104+ self .source_aggr_size += doc_source_size
102105 docs .append (
103106 {
104- "doc" : doc . get ( " page_content" ) ,
107+ "doc" : page_content ,
105108 "source_path" : doc_source_path ,
106109 "last_modified" : doc .get ("metadata" , {}).get ("last_modified" ),
107110 "file_owner" : doc_source_owner ,
108- "source_size " : doc_source_size ,
111+ "source_path_size " : doc_source_size ,
109112 }
110113 )
111114 payload = {
@@ -120,6 +123,7 @@ def _send_loader_doc(self, loading_end=False):
120123 }
121124 if loading_end is True :
122125 payload ["loading_end" ] = "true"
126+ payload ["loader_details" ]["source_aggr_size" ] = self .source_aggr_size
123127 try :
124128 payload = Doc .model_validate (payload ).model_dump (exclude_unset = True )
125129 except AttributeError :
@@ -154,6 +158,19 @@ def _send_loader_doc(self, loading_end=False):
154158 if loading_end is True :
155159 PebbloSafeLoader .set_loader_sent ()
156160
@staticmethod
def calculate_content_size(page_content) -> int:
    """Return the size of ``page_content`` in bytes when encoded as UTF-8.

    Args:
        page_content: Text content of a loaded document (a ``str``), or
            ``None`` when the document carries no content.

    Returns:
        The number of bytes in the UTF-8 encoding of the content, or 0
        when ``page_content`` is ``None``.
    """
    # The content is typically fetched with ``doc.get("page_content")``,
    # which yields None for a missing key; count that as zero bytes
    # instead of failing with AttributeError on ``None.encode``.
    if page_content is None:
        return 0
    # Measure encoded bytes rather than characters: non-ASCII characters
    # occupy more than one byte in UTF-8.
    return len(page_content.encode("utf-8"))
173+
157174 def _send_discover (self ):
158175 headers = {"Accept" : "application/json" , "Content-Type" : "application/json" }
159176 try :
@@ -212,6 +229,7 @@ def get_file_owner_from_path(file_path: str) -> str:
212229 return file_owner_name
213230
214231 def get_source_size (self , source_path : str ) -> int :
232+ size = None
215233 if os .path .isfile (source_path ):
216234 size = os .path .getsize (source_path )
217235 elif os .path .isdir (source_path ):
0 commit comments