|
1 | | -import os |
2 | | -import shutil |
3 | | -import zipfile |
4 | 1 | from typing import Optional |
5 | 2 |
|
6 | | -import filetype |
7 | 3 | from llama_index import Document, StorageContext, VectorStoreIndex |
8 | 4 | from llama_index.node_parser import SimpleNodeParser |
9 | 5 | from llama_index.vector_stores import VectorStoreQuery, VectorStoreQueryResult |
| 6 | +from unstract.adapters.x2text.x2text_adapter import X2TextAdapter |
10 | 7 |
|
11 | 8 | from unstract.sdk.constants import LogLevel, ToolEnv |
12 | 9 | from unstract.sdk.embedding import ToolEmbedding |
|
15 | 12 | from unstract.sdk.utils import ToolUtils |
16 | 13 | from unstract.sdk.utils.service_context import ServiceContext |
17 | 14 | from unstract.sdk.vector_db import ToolVectorDB |
18 | | - |
19 | | -allowed_pdf_to_text_converters = [ |
20 | | - "default", |
21 | | - "unstract_llm_whisperer", |
22 | | - "unstract_camelot", |
23 | | -] |
| 15 | +from unstract.sdk.x2txt import X2Text |
24 | 16 |
|
25 | 17 |
|
26 | 18 | class ToolIndex: |
@@ -106,93 +98,30 @@ def index_file( |
106 | 98 | tool_id: str, |
107 | 99 | embedding_type: str, |
108 | 100 | vector_db: str, |
| 101 | + x2text_adapter: str, |
109 | 102 | file_path: str, |
110 | 103 | chunk_size: int, |
111 | 104 | chunk_overlap: int, |
112 | 105 | reindex: bool = False, |
113 | | - converter: str = "default", |
114 | 106 | file_hash: Optional[str] = None, |
115 | 107 | ): |
116 | | - if converter not in allowed_pdf_to_text_converters: |
117 | | - self.tool.stream_log( |
118 | | - "pdf-to-text-converters must be one of " |
119 | | - f"{allowed_pdf_to_text_converters}", |
120 | | - level=LogLevel.ERROR, |
121 | | - ) |
122 | | - raise SdkException( |
123 | | - "pdf-to-text-converters must be one of " |
124 | | - f"{allowed_pdf_to_text_converters}" |
125 | | - ) |
126 | | - |
127 | | - input_file_type = None |
128 | | - input_file_type_mime = None |
129 | | - |
130 | 108 | # Make file content hash if not available |
131 | 109 | if not file_hash: |
132 | 110 | file_hash = ToolUtils.get_hash_from_file(file_path=file_path) |
133 | | - with open(file_path, mode="rb") as input_file_obj: |
134 | | - sample_contents = input_file_obj.read(100) |
135 | | - input_file_type = filetype.guess(sample_contents) |
136 | | - |
137 | | - if input_file_type is None: |
138 | | - input_file_type_mime = "text/plain" |
139 | | - else: |
140 | | - input_file_type_mime = input_file_type.MIME |
141 | | - |
142 | | - self.tool.stream_log(f"Input file type: {input_file_type_mime}") |
143 | 111 |
|
| 112 | + self.tool.stream_log("Extracting text from input file") |
144 | 113 | full_text = [] |
145 | | - |
146 | | - if input_file_type_mime == "text/plain": |
147 | | - with open(file_path) as input_file_obj: |
148 | | - full_text.append( |
149 | | - { |
150 | | - "section": "full", |
151 | | - "text_contents": self._cleanup_text( |
152 | | - input_file_obj.read() |
153 | | - ), |
154 | | - } |
155 | | - ) |
156 | | - |
157 | | - elif input_file_type_mime == "application/pdf": |
158 | | - raise SdkException( |
159 | | - "Indexing of PDF files is not supported currently" |
160 | | - ) |
161 | | - # TODO: Make use of adapters to convert X2Text |
162 | | - # self.tool.stream_log(f"PDF to text converter: {converter}") |
163 | | - # if converter == "unstract_llm_whisperer" or converter == "default": # noqa |
164 | | - # full_text.append( |
165 | | - # { |
166 | | - # "section": "full", |
167 | | - # "text_contents": self._cleanup_text( |
168 | | - # x2txt.generate_whisper( |
169 | | - # input_file=file_path, |
170 | | - # mode="text", |
171 | | - # dump_text=True, |
172 | | - # ) |
173 | | - # ), |
174 | | - # } |
175 | | - # ) |
176 | | - # else: |
177 | | - # # TODO : Support for Camelot |
178 | | - # x2txt = X2Text(tool=self.tool) |
179 | | - |
180 | | - elif input_file_type_mime == "application/zip": |
181 | | - self.tool.stream_log("Zip file extraction required") |
182 | | - with zipfile.ZipFile(file_path, "r") as zip_ref: |
183 | | - file_name_from_path = os.path.basename(file_path) |
184 | | - temp_directory = f"/tmp/unstract_zip/{file_name_from_path}" |
185 | | - # If temp_directory exists, delete it and create it again |
186 | | - if os.path.exists(temp_directory): |
187 | | - shutil.rmtree(temp_directory) |
188 | | - os.makedirs(temp_directory) |
189 | | - zip_ref.extractall(temp_directory) |
190 | | - else: |
191 | | - self.tool.stream_log( |
192 | | - f"Unsupported file type: {input_file_type_mime}", |
193 | | - level=LogLevel.ERROR, |
194 | | - ) |
195 | | - raise SdkException(f"Unsupported file type: {input_file_type_mime}") |
| 114 | + x2text = X2Text(tool=self.tool) |
| 115 | + x2text_adapter: X2TextAdapter = x2text.get_x2text( |
| 116 | + adapter_instance_id=x2text_adapter |
| 117 | + ) |
| 118 | + extracted_text = x2text_adapter.process(input_file_path=file_path) |
| 119 | + full_text.append( |
| 120 | + { |
| 121 | + "section": "full", |
| 122 | + "text_contents": self._cleanup_text(extracted_text), |
| 123 | + } |
| 124 | + ) |
196 | 125 |
|
197 | 126 | doc_id = ToolIndex.generate_file_id( |
198 | 127 | tool_id=tool_id, |
|
0 commit comments