Skip to content

Commit 11d2d9f

Browse files
authored
Capture the aggregate size of loaded documents
* Calculate the doc source size
* Fix size variable init error
* Fixed imports
1 parent e71358e commit 11d2d9f

File tree

2 files changed

+24
-6
lines changed

2 files changed

+24
-6
lines changed

.gitignore

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -157,4 +157,4 @@ cython_debug/
157157
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158158
# and can be added to the global gitignore or merged into this file. For a more nuclear
159159
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
160-
#.idea/
160+
.idea/

pebblo_langchain/langchain_community/document_loaders/pebblo.py

Lines changed: 23 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -45,12 +45,13 @@ def __init__(
4545
self.docs = []
4646
loader_name = str(type(self.loader)).split(".")[-1].split("'")[0]
4747
self.source_type = get_loader_type(loader_name)
48-
self.source_size = self.get_source_size(self.source_path)
48+
self.source_path_size = self.get_source_size(self.source_path)
49+
self.source_aggr_size = 0
4950
self.loader_details = {
5051
"loader": loader_name,
5152
"source_path": self.source_path,
5253
"source_type": self.source_type,
53-
"source_size": self.source_size,
54+
"source_path_size": self.source_path_size,
5455
}
5556
# generate app
5657
self.app = self._get_app_details()
@@ -98,14 +99,16 @@ def _send_loader_doc(self, loading_end=False):
9899
for doc in doc_content:
99100
doc_source_path = get_full_path(doc.get("metadata", {}).get("source"))
100101
doc_source_owner = PebbloSafeLoader.get_file_owner_from_path(doc_source_path)
101-
doc_source_size = self.get_source_size(doc_source_path)
102+
page_content = doc.get("page_content")
103+
doc_source_size = self.calculate_content_size(page_content)
104+
self.source_aggr_size += doc_source_size
102105
docs.append(
103106
{
104-
"doc": doc.get("page_content"),
107+
"doc": page_content,
105108
"source_path": doc_source_path,
106109
"last_modified": doc.get("metadata", {}).get("last_modified"),
107110
"file_owner": doc_source_owner,
108-
"source_size": doc_source_size,
111+
"source_path_size": doc_source_size,
109112
}
110113
)
111114
payload = {
@@ -120,6 +123,7 @@ def _send_loader_doc(self, loading_end=False):
120123
}
121124
if loading_end is True:
122125
payload["loading_end"] = "true"
126+
payload["loader_details"]["source_aggr_size"] = self.source_aggr_size
123127
try:
124128
payload = Doc.model_validate(payload).model_dump(exclude_unset=True)
125129
except AttributeError:
@@ -154,6 +158,19 @@ def _send_loader_doc(self, loading_end=False):
154158
if loading_end is True:
155159
PebbloSafeLoader.set_loader_sent()
156160

161+
@staticmethod
def calculate_content_size(page_content):
    """
    Calculate the size of a document's content in bytes.

    The content is encoded as UTF-8 and the number of resulting bytes is
    returned, so multi-byte characters count for more than one byte each.

    Args:
        page_content: Text of the loaded document. ``None`` (for example a
            document dict that has no ``page_content`` key) is treated as
            empty content.

    Returns:
        int: Number of bytes in the UTF-8 encoding; 0 when the content is
        ``None`` or an empty string.
    """
    # Missing content counts as zero bytes instead of raising
    # AttributeError on ``None.encode``.
    if page_content is None:
        return 0
    return len(page_content.encode("utf-8"))
173+
157174
def _send_discover(self):
158175
headers = {"Accept": "application/json", "Content-Type": "application/json"}
159176
try:
@@ -212,6 +229,7 @@ def get_file_owner_from_path(file_path: str) -> str:
212229
return file_owner_name
213230

214231
def get_source_size(self, source_path: str) -> int:
232+
size = None
215233
if os.path.isfile(source_path):
216234
size = os.path.getsize(source_path)
217235
elif os.path.isdir(source_path):

0 commit comments

Comments
 (0)