|
18 | 18 |
|
class ToolIndex:
    def __init__(self, tool: BaseTool):
        """Create an index helper bound to *tool*.

        The wrapped tool is kept on ``self.tool`` so the indexing
        methods can stream logs and reach adapters through it.
        """
        # TODO: Inherit from StreamMixin and avoid using BaseTool
        self.tool = tool
22 | 23 |
|
23 | 24 | def get_text_from_index( |
@@ -133,27 +134,6 @@ def index_file( |
133 | 134 | if not file_hash: |
134 | 135 | file_hash = ToolUtils.get_hash_from_file(file_path=file_path) |
135 | 136 |
|
136 | | - self.tool.stream_log("Extracting text from input file") |
137 | | - full_text = [] |
138 | | - extracted_text = "" |
139 | | - try: |
140 | | - x2text = X2Text(tool=self.tool) |
141 | | - x2text_adapter_inst: X2TextAdapter = x2text.get_x2text( |
142 | | - adapter_instance_id=x2text_adapter |
143 | | - ) |
144 | | - extracted_text = x2text_adapter_inst.process( |
145 | | - input_file_path=file_path, output_file_path=output_file_path |
146 | | - ) |
147 | | - except AdapterError as e: |
148 | | - # Wrapping AdapterErrors with SdkError |
149 | | - raise SdkError(str(e)) from e |
150 | | - full_text.append( |
151 | | - { |
152 | | - "section": "full", |
153 | | - "text_contents": self._cleanup_text(extracted_text), |
154 | | - } |
155 | | - ) |
156 | | - |
157 | 137 | doc_id = ToolIndex.generate_file_id( |
158 | 138 | tool_id=tool_id, |
159 | 139 | file_hash=file_hash, |
@@ -224,6 +204,27 @@ def index_file( |
224 | 204 | doc_id_not_found = True |
225 | 205 |
|
226 | 206 | if doc_id_not_found: |
| 207 | + self.tool.stream_log("Extracting text from input file") |
| 208 | + full_text = [] |
| 209 | + extracted_text = "" |
| 210 | + try: |
| 211 | + x2text = X2Text(tool=self.tool) |
| 212 | + x2text_adapter_inst: X2TextAdapter = x2text.get_x2text( |
| 213 | + adapter_instance_id=x2text_adapter |
| 214 | + ) |
| 215 | + extracted_text = x2text_adapter_inst.process( |
| 216 | + input_file_path=file_path, output_file_path=output_file_path |
| 217 | + ) |
| 218 | + except AdapterError as e: |
| 219 | + # Wrapping AdapterErrors with SdkError |
| 220 | + raise SdkError(str(e)) from e |
| 221 | + full_text.append( |
| 222 | + { |
| 223 | + "section": "full", |
| 224 | + "text_contents": self._cleanup_text(extracted_text), |
| 225 | + } |
| 226 | + ) |
| 227 | + |
227 | 228 | # Check if chunking is required |
228 | 229 | documents = [] |
229 | 230 | for item in full_text: |
|
0 commit comments