Skip to content

Commit f5fd84f

Browse files
authored
Feat/usage reporting page (#80)
* Pushing page usage * Pushing page usage * Updating pdf plumber dependency * Adding run id * Bumped up the version * Bumped up the patch version * Handled page calculation for LLM Whisperer * Moving usage push post processing * Added condition for empty page string * Expecting file name to be passed via usage kwargs * updated the version to 0.42.1 * updated the version to 0.44.0 --------- Signed-off-by: Rahul Johny <[email protected]>
1 parent 28434ca commit f5fd84f

File tree

8 files changed

+256
-32
lines changed

8 files changed

+256
-32
lines changed

pdm.lock

Lines changed: 54 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@ dependencies = [
5757
# For singleton classes
5858
"singleton-decorator~=1.0.0",
5959
"httpx>=0.25.2",
60+
"pdfplumber>=0.11.2",
6061
]
6162
readme = "README.md"
6263
urls = { Homepage = "https://unstract.com", "Release notes" = "https://github.com/Zipstack/unstract-sdk/releases", Source = "https://github.com/Zipstack/unstract-sdk" }

src/unstract/sdk/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
__version__ = "0.43.0"
1+
__version__ = "0.44.0"
22

33

44
def get_sdk_version():

src/unstract/sdk/audit.py

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -113,3 +113,49 @@ def push_usage_data(
113113
finally:
114114
if isinstance(token_counter, TokenCountingHandler):
115115
token_counter.reset_counts()
116+
117+
def push_page_usage_data(
    self,
    platform_api_key: str,
    page_count: int,
    file_size: int,
    file_type: str,
    kwargs: dict[Any, Any] = None,
) -> None:
    """Push page usage details of a processed file to the platform service.

    Sends a POST request to the ``/page-usage`` endpoint of the platform
    (host and port are read from the environment). Request errors and
    non-200 responses are reported through ``stream_log`` at ERROR level
    instead of being raised.

    Args:
        platform_api_key (str): API key sent as the bearer token.
        page_count (int): Number of pages to report.
        file_size (int): Size of the file to report.
        file_type (str): Type of the file to report.
        kwargs (dict[Any, Any]): Optional extra details. The keys
            ``run_id`` and ``file_name`` are read from it; each defaults
            to an empty string when absent. ``None`` is treated as an
            empty dict.

    Returns:
        None
    """
    # The default is None, so guard before calling .get() on it.
    usage_kwargs = {} if kwargs is None else kwargs
    platform_host = self.get_env_or_die(ToolEnv.PLATFORM_HOST)
    platform_port = self.get_env_or_die(ToolEnv.PLATFORM_PORT)
    run_id = usage_kwargs.get("run_id", "")
    file_name = usage_kwargs.get("file_name", "")
    base_url = SdkHelper.get_platform_base_url(
        platform_host=platform_host, platform_port=platform_port
    )
    bearer_token = platform_api_key
    url = f"{base_url}/page-usage"
    headers = {"Authorization": f"Bearer {bearer_token}"}

    data = {
        "page_count": page_count,
        "file_name": file_name,
        "file_size": file_size,
        "file_type": file_type,
        "run_id": run_id,
    }

    # Only the network call sits inside the try; logging happens outside it.
    try:
        response = requests.post(url, headers=headers, json=data, timeout=30)
    except requests.RequestException as e:
        self.stream_log(
            log=f"Error while pushing page usage details: {e}",
            level=LogLevel.ERROR,
        )
        return

    if response.status_code != 200:
        # No trailing comma inside the parentheses: the message must be a
        # single string, not a 1-tuple.
        self.stream_log(
            log=(
                "Error while pushing page usage details: "
                f"{response.status_code} {response.reason}"
            ),
            level=LogLevel.ERROR,
        )
    else:
        self.stream_log("Successfully pushed page usage details")

src/unstract/sdk/constants.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -158,3 +158,8 @@ class PublicAdapterKeys:
158158
PUBLIC_EMBEDDING_CONFIG = "PUBLIC_EMBEDDING_CONFIG"
159159
PUBLIC_VECTOR_DB_CONFIG = "PUBLIC_VECTOR_DB_CONFIG"
160160
PUBLIC_X2TEXT_CONFIG = "PUBLIC_X2TEXT_CONFIG"
161+
162+
163+
class MimeType:
164+
PDF = "application/pdf"
165+
TEXT = "text/plain"

src/unstract/sdk/index.py

Lines changed: 24 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -225,35 +225,32 @@ def index(
225225
full_text = []
226226
extracted_text = ""
227227
try:
228-
mime_type = ToolUtils.get_file_mime_type(file_path)
229-
if mime_type == "text/plain":
230-
with open(file_path, encoding="utf-8") as file:
231-
extracted_text = file.read()
228+
x2text = X2Text(
229+
tool=self.tool,
230+
adapter_instance_id=x2text_instance_id,
231+
usage_kwargs=usage_kwargs,
232+
)
233+
if enable_highlight and isinstance(
234+
x2text._x2text_instance, LLMWhisperer
235+
):
236+
process_response: TextExtractionResult = x2text.process(
237+
input_file_path=file_path,
238+
output_file_path=output_file_path,
239+
enable_highlight=enable_highlight,
240+
)
241+
whisper_hash_value = (
242+
process_response.extraction_metadata.whisper_hash
243+
)
244+
245+
metadata = {X2TextConstants.WHISPER_HASH: whisper_hash_value}
246+
247+
self.tool.update_exec_metadata(metadata)
248+
232249
else:
233-
x2text = X2Text(
234-
tool=self.tool, adapter_instance_id=x2text_instance_id
250+
process_response: TextExtractionResult = x2text.process(
251+
input_file_path=file_path,
252+
output_file_path=output_file_path,
235253
)
236-
if enable_highlight and isinstance(
237-
x2text._x2text_instance, LLMWhisperer
238-
):
239-
process_response: TextExtractionResult = x2text.process(
240-
input_file_path=file_path,
241-
output_file_path=output_file_path,
242-
enable_highlight=enable_highlight,
243-
)
244-
whisper_hash_value = (
245-
process_response.extraction_metadata.whisper_hash
246-
)
247-
248-
metadata = {X2TextConstants.WHISPER_HASH: whisper_hash_value}
249-
250-
self.tool.update_exec_metadata(metadata)
251-
252-
else:
253-
process_response: TextExtractionResult = x2text.process(
254-
input_file_path=file_path,
255-
output_file_path=output_file_path,
256-
)
257254

258255
extracted_text = process_response.extracted_text
259256
except AdapterError as e:

src/unstract/sdk/utils/tool_utils.py

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,23 @@ def get_file_mime_type(input_file: Path) -> str:
101101
input_file_obj.seek(0)
102102
return input_file_mime
103103

104+
@staticmethod
105+
def get_file_size(input_file: Path) -> int:
    """Gets the file size in bytes for an input file.

    Args:
        input_file (Path): Path object of the input file

    Returns:
        int: Size of the file in bytes
    """
    with open(input_file, mode="rb") as input_file_obj:
        # Seeking to the end (whence=2) returns the new absolute offset,
        # which is the file length. No rewind is needed because the file
        # is closed right after.
        return input_file_obj.seek(0, 2)
120+
104121
@staticmethod
105122
def str_to_bool(string: str) -> bool:
106123
"""String value of boolean to boolean.
@@ -114,3 +131,57 @@ def str_to_bool(string: str) -> bool:
114131
bool
115132
"""
116133
return string.lower() == "true"
134+
135+
# Used the same function from LLM Whisperer
136+
@staticmethod
137+
def calculate_page_count(
    pages_string: str, max_page: int = 0, min_page: int = 1
) -> int:
    """Calculates the total number of pages selected by a page string.

    Parses 'pages_string' into individual page numbers or ranges separated
    by commas. Supports closed ranges like '1-5' and open-ended ranges like
    '4-' or '-5'. Every page number and range is clamped to the inclusive
    interval [min_page, max_page]; anything falling outside it is not
    counted. Empty parts (e.g. from a trailing comma) are ignored.

    Args:
        pages_string (str): String containing page numbers or ranges
            separated by commas. When empty, 'max_page' is returned.
        max_page (int): Upper limit for page numbers (default is 0)
        min_page (int): Lower limit for page numbers (default is 1)

    Returns:
        int: Total count of pages selected by the input string

    Raises:
        ValueError: If a part is not a valid number or range, or if
            'max_page' is None for an open-ended range like '4-'.
    """
    if not pages_string:
        return max_page

    total = 0
    for part in pages_string.split(","):
        part = part.strip()
        if not part:
            # Tolerate stray separators such as "1,2," or "1,,2".
            continue

        if "-" not in part:
            # Single page: bounds-checked the same way ranges are.
            if min_page <= int(part) <= max_page:
                total += 1
            continue

        if part.startswith("-"):  # e.g., "-5"
            start, end = min_page, int(part[1:])
        elif part.endswith("-"):  # e.g., "4-"
            start = int(part[:-1])
            if max_page is None:
                raise ValueError(
                    "max_page must be defined for open-ended ranges like '4-'"
                )
            end = max_page
        else:  # e.g., "1-5"
            start, end = map(int, part.split("-"))

        # Clamp to the valid page interval; an inverted range counts as 0.
        start = max(start, min_page)
        end = min(end, max_page)
        total += max(0, end - start + 1)
    return total

src/unstract/sdk/x2txt.py

Lines changed: 54 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,30 +1,37 @@
11
from abc import ABCMeta
22
from typing import Any, Optional
33

4+
import pdfplumber
45
from typing_extensions import deprecated
56

67
from unstract.sdk.adapter import ToolAdapter
78
from unstract.sdk.adapters.constants import Common
89
from unstract.sdk.adapters.x2text import adapters
910
from unstract.sdk.adapters.x2text.constants import X2TextConstants
1011
from unstract.sdk.adapters.x2text.dto import TextExtractionResult
12+
from unstract.sdk.adapters.x2text.llm_whisperer.src import LLMWhisperer
13+
from unstract.sdk.adapters.x2text.llm_whisperer.src.constants import WhispererConfig
1114
from unstract.sdk.adapters.x2text.x2text_adapter import X2TextAdapter
12-
from unstract.sdk.constants import LogLevel
15+
from unstract.sdk.audit import Audit
16+
from unstract.sdk.constants import LogLevel, MimeType, ToolEnv
1317
from unstract.sdk.exceptions import X2TextError
1418
from unstract.sdk.helper import SdkHelper
1519
from unstract.sdk.tool.base import BaseTool
20+
from unstract.sdk.utils import ToolUtils
1621

1722

1823
class X2Text(metaclass=ABCMeta):
1924
def __init__(
2025
self,
2126
tool: BaseTool,
22-
adapter_instance_id: Optional[str] = None
27+
adapter_instance_id: Optional[str] = None,
28+
usage_kwargs: dict[Any, Any] = {},
2329
):
2430
self._tool = tool
2531
self._x2text_adapters = adapters
2632
self._adapter_instance_id = adapter_instance_id
2733
self._x2text_instance: X2TextAdapter = None
34+
self._usage_kwargs = usage_kwargs
2835
self._initialise()
2936

3037
def _initialise(self):
@@ -82,13 +89,57 @@ def process(
8289
output_file_path: Optional[str] = None,
8390
**kwargs: dict[Any, Any],
8491
) -> TextExtractionResult:
85-
return self._x2text_instance.process(
92+
mime_type = ToolUtils.get_file_mime_type(input_file_path)
93+
text_extraction_result: TextExtractionResult = None
94+
if mime_type == MimeType.TEXT:
95+
with open(input_file_path, encoding="utf-8") as file:
96+
extracted_text = file.read()
97+
text_extraction_result = TextExtractionResult(
98+
extracted_text=extracted_text, extraction_metadata=None
99+
)
100+
text_extraction_result = self._x2text_instance.process(
86101
input_file_path, output_file_path, **kwargs
87102
)
103+
# This will be executed each and every time text extraction takes place
104+
self.push_usage_details(input_file_path, mime_type)
105+
return text_extraction_result
88106

89107
@deprecated("Instantiate X2Text and call process() instead")
90108
def get_x2text(self, adapter_instance_id: str) -> X2TextAdapter:
91109
if not self._x2text_instance:
92110
self._adapter_instance_id = adapter_instance_id
93111
self._initialise()
94112
return self._x2text_instance
113+
114+
def push_usage_details(self, input_file_path: str, mime_type: str) -> None:
    """Compute the page usage of an input file and push it for auditing.

    For PDF files the page count is read with pdfplumber. When the
    configured adapter is an LLMWhisperer instance, the count is narrowed
    using its ``PAGES_TO_EXTRACT`` configuration value. All other file
    types are reported as a single page.

    Args:
        input_file_path (str): Path of the file that was processed.
        mime_type (str): MIME type of the file.

    Returns:
        None
    """
    file_size = ToolUtils.get_file_size(input_file_path)

    # Certain image types and raw text are allowed as input. They are
    # treated as single-page documents since they have no concept of
    # page numbers.
    page_count = 1
    if mime_type == MimeType.PDF:
        with pdfplumber.open(input_file_path) as pdf:
            page_count = len(pdf.pages)
        if isinstance(self._x2text_instance, LLMWhisperer):
            # Only the pages selected for extraction are counted.
            page_count = ToolUtils.calculate_page_count(
                self._x2text_instance.config.get(WhispererConfig.PAGES_TO_EXTRACT),
                page_count,
            )

    Audit().push_page_usage_data(
        platform_api_key=self._tool.get_env_or_die(ToolEnv.PLATFORM_API_KEY),
        file_size=file_size,
        file_type=mime_type,
        page_count=page_count,
        kwargs=self._usage_kwargs,
    )

0 commit comments

Comments
 (0)