|
1 | 1 | from abc import ABCMeta |
2 | 2 | from typing import Any, Optional |
3 | 3 |
|
| 4 | +import pdfplumber |
4 | 5 | from typing_extensions import deprecated |
5 | 6 |
|
6 | 7 | from unstract.sdk.adapter import ToolAdapter |
7 | 8 | from unstract.sdk.adapters.constants import Common |
8 | 9 | from unstract.sdk.adapters.x2text import adapters |
9 | 10 | from unstract.sdk.adapters.x2text.constants import X2TextConstants |
10 | 11 | from unstract.sdk.adapters.x2text.dto import TextExtractionResult |
| 12 | +from unstract.sdk.adapters.x2text.llm_whisperer.src import LLMWhisperer |
| 13 | +from unstract.sdk.adapters.x2text.llm_whisperer.src.constants import WhispererConfig |
11 | 14 | from unstract.sdk.adapters.x2text.x2text_adapter import X2TextAdapter |
12 | | -from unstract.sdk.constants import LogLevel |
| 15 | +from unstract.sdk.audit import Audit |
| 16 | +from unstract.sdk.constants import LogLevel, MimeType, ToolEnv |
13 | 17 | from unstract.sdk.exceptions import X2TextError |
14 | 18 | from unstract.sdk.helper import SdkHelper |
15 | 19 | from unstract.sdk.tool.base import BaseTool |
| 20 | +from unstract.sdk.utils import ToolUtils |
16 | 21 |
|
17 | 22 |
|
18 | 23 | class X2Text(metaclass=ABCMeta): |
def __init__(
    self,
    tool: BaseTool,
    adapter_instance_id: Optional[str] = None,
    usage_kwargs: Optional[dict[Any, Any]] = None,
):
    """Set up the wrapper and initialise the underlying X2Text adapter.

    Args:
        tool: Tool instance stored on the wrapper; it is later used to
            read the platform API key when usage is reported.
        adapter_instance_id: Identifier of the adapter instance to use.
            May be omitted and supplied later.
        usage_kwargs: Extra key/value pairs passed along as ``kwargs``
            when page usage is pushed. Defaults to a new empty dict.
    """
    self._tool = tool
    self._x2text_adapters = adapters
    self._adapter_instance_id = adapter_instance_id
    self._x2text_instance: X2TextAdapter = None
    # A ``{}`` default would be created once at definition time and shared
    # by every instance; build a fresh dict per instance instead.
    self._usage_kwargs = {} if usage_kwargs is None else usage_kwargs
    self._initialise()
29 | 36 |
|
30 | 37 | def _initialise(self): |
@@ -82,13 +89,57 @@ def process( |
82 | 89 | output_file_path: Optional[str] = None, |
83 | 90 | **kwargs: dict[Any, Any], |
84 | 91 | ) -> TextExtractionResult: |
85 | | - return self._x2text_instance.process( |
| 92 | + mime_type = ToolUtils.get_file_mime_type(input_file_path) |
| 93 | + text_extraction_result: TextExtractionResult = None |
| 94 | + if mime_type == MimeType.TEXT: |
| 95 | + with open(input_file_path, encoding="utf-8") as file: |
| 96 | + extracted_text = file.read() |
| 97 | + text_extraction_result = TextExtractionResult( |
| 98 | + extracted_text=extracted_text, extraction_metadata=None |
| 99 | + ) |
| 100 | + text_extraction_result = self._x2text_instance.process( |
86 | 101 | input_file_path, output_file_path, **kwargs |
87 | 102 | ) |
| 103 | + # This will be executed each and every time text extraction takes place |
| 104 | + self.push_usage_details(input_file_path, mime_type) |
| 105 | + return text_extraction_result |
88 | 106 |
|
@deprecated("Instantiate X2Text and call process() instead")
def get_x2text(self, adapter_instance_id: str) -> X2TextAdapter:
    """Return the underlying adapter, initialising it on first use.

    If an adapter instance is already present it is returned as-is and
    ``adapter_instance_id`` is ignored. Otherwise the given id is stored
    and ``_initialise()`` is run before returning the instance.

    Args:
        adapter_instance_id: Adapter instance id to initialise with when
            no adapter instance exists yet.

    Returns:
        The X2Text adapter instance held by this wrapper.
    """
    # Guard clause: reuse the adapter that already exists.
    if self._x2text_instance:
        return self._x2text_instance

    self._adapter_instance_id = adapter_instance_id
    self._initialise()
    return self._x2text_instance
| 113 | + |
def push_usage_details(self, input_file_path: str, mime_type: str) -> None:
    """Push file size, file type and page count for a file to Audit.

    For PDFs the page count is read with pdfplumber. When the active
    adapter is an ``LLMWhisperer``, that count is passed through
    ``ToolUtils.calculate_page_count`` together with the adapter's
    configured ``PAGES_TO_EXTRACT`` value. Every other MIME type is
    reported as a single page.

    Args:
        input_file_path: Path of the file whose usage is being reported.
        mime_type: MIME type of the file, as determined by the caller.
    """
    file_size = ToolUtils.get_file_size(input_file_path)

    if mime_type == MimeType.PDF:
        # Only the page count is needed, so the PDF is closed right away.
        with pdfplumber.open(input_file_path) as pdf:
            page_count = len(pdf.pages)
        if isinstance(self._x2text_instance, LLMWhisperer):
            page_count = ToolUtils.calculate_page_count(
                self._x2text_instance.config.get(WhispererConfig.PAGES_TO_EXTRACT),
                page_count,
            )
    else:
        # Certain image types and raw text are allowed. They are treated as
        # single-page documents because there is no concept of page numbers.
        page_count = 1

    Audit().push_page_usage_data(
        platform_api_key=self._tool.get_env_or_die(ToolEnv.PLATFORM_API_KEY),
        file_size=file_size,
        file_type=mime_type,
        page_count=page_count,
        kwargs=self._usage_kwargs,
    )
0 commit comments