Skip to content

Commit 069322c

Browse files
authored
Merge pull request #7 from Zipstack/index-file-update-with-x2text-adapter-usage
fix: Updated index_file() to use x2text adapter
2 parents e447e1e + 4a35b93 commit 069322c

File tree

6 files changed: 45 additions and 113 deletions.

pdm.lock

Lines changed: 4 additions & 4 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

src/unstract/sdk/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
__version__ = "0.10.1"
1+
__version__ = "0.11.0"
22

33

44
def get_sdk_version():

src/unstract/sdk/exceptions.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,3 @@ def __init__(
1111
@property
1212
def user_message(self) -> Optional[str]:
1313
return self._user_message
14-
15-
def __str__(self) -> str:
16-
return f"{self.message}"

src/unstract/sdk/index.py

Lines changed: 15 additions & 86 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,9 @@
1-
import os
2-
import shutil
3-
import zipfile
41
from typing import Optional
52

6-
import filetype
73
from llama_index import Document, StorageContext, VectorStoreIndex
84
from llama_index.node_parser import SimpleNodeParser
95
from llama_index.vector_stores import VectorStoreQuery, VectorStoreQueryResult
6+
from unstract.adapters.x2text.x2text_adapter import X2TextAdapter
107

118
from unstract.sdk.constants import LogLevel, ToolEnv
129
from unstract.sdk.embedding import ToolEmbedding
@@ -15,12 +12,7 @@
1512
from unstract.sdk.utils import ToolUtils
1613
from unstract.sdk.utils.service_context import ServiceContext
1714
from unstract.sdk.vector_db import ToolVectorDB
18-
19-
allowed_pdf_to_text_converters = [
20-
"default",
21-
"unstract_llm_whisperer",
22-
"unstract_camelot",
23-
]
15+
from unstract.sdk.x2txt import X2Text
2416

2517

2618
class ToolIndex:
@@ -106,93 +98,30 @@ def index_file(
10698
tool_id: str,
10799
embedding_type: str,
108100
vector_db: str,
101+
x2text_adapter: str,
109102
file_path: str,
110103
chunk_size: int,
111104
chunk_overlap: int,
112105
reindex: bool = False,
113-
converter: str = "default",
114106
file_hash: Optional[str] = None,
115107
):
116-
if converter not in allowed_pdf_to_text_converters:
117-
self.tool.stream_log(
118-
"pdf-to-text-converters must be one of "
119-
f"{allowed_pdf_to_text_converters}",
120-
level=LogLevel.ERROR,
121-
)
122-
raise SdkException(
123-
"pdf-to-text-converters must be one of "
124-
f"{allowed_pdf_to_text_converters}"
125-
)
126-
127-
input_file_type = None
128-
input_file_type_mime = None
129-
130108
# Make file content hash if not available
131109
if not file_hash:
132110
file_hash = ToolUtils.get_hash_from_file(file_path=file_path)
133-
with open(file_path, mode="rb") as input_file_obj:
134-
sample_contents = input_file_obj.read(100)
135-
input_file_type = filetype.guess(sample_contents)
136-
137-
if input_file_type is None:
138-
input_file_type_mime = "text/plain"
139-
else:
140-
input_file_type_mime = input_file_type.MIME
141-
142-
self.tool.stream_log(f"Input file type: {input_file_type_mime}")
143111

112+
self.tool.stream_log("Extracting text from input file")
144113
full_text = []
145-
146-
if input_file_type_mime == "text/plain":
147-
with open(file_path) as input_file_obj:
148-
full_text.append(
149-
{
150-
"section": "full",
151-
"text_contents": self._cleanup_text(
152-
input_file_obj.read()
153-
),
154-
}
155-
)
156-
157-
elif input_file_type_mime == "application/pdf":
158-
raise SdkException(
159-
"Indexing of PDF files is not supported currently"
160-
)
161-
# TODO: Make use of adapters to convert X2Text
162-
# self.tool.stream_log(f"PDF to text converter: {converter}")
163-
# if converter == "unstract_llm_whisperer" or converter == "default": # noqa
164-
# full_text.append(
165-
# {
166-
# "section": "full",
167-
# "text_contents": self._cleanup_text(
168-
# x2txt.generate_whisper(
169-
# input_file=file_path,
170-
# mode="text",
171-
# dump_text=True,
172-
# )
173-
# ),
174-
# }
175-
# )
176-
# else:
177-
# # TODO : Support for Camelot
178-
# x2txt = X2Text(tool=self.tool)
179-
180-
elif input_file_type_mime == "application/zip":
181-
self.tool.stream_log("Zip file extraction required")
182-
with zipfile.ZipFile(file_path, "r") as zip_ref:
183-
file_name_from_path = os.path.basename(file_path)
184-
temp_directory = f"/tmp/unstract_zip/{file_name_from_path}"
185-
# If temp_directory exists, delete it and create it again
186-
if os.path.exists(temp_directory):
187-
shutil.rmtree(temp_directory)
188-
os.makedirs(temp_directory)
189-
zip_ref.extractall(temp_directory)
190-
else:
191-
self.tool.stream_log(
192-
f"Unsupported file type: {input_file_type_mime}",
193-
level=LogLevel.ERROR,
194-
)
195-
raise SdkException(f"Unsupported file type: {input_file_type_mime}")
114+
x2text = X2Text(tool=self.tool)
115+
x2text_adapter: X2TextAdapter = x2text.get_x2text(
116+
adapter_instance_id=x2text_adapter
117+
)
118+
extracted_text = x2text_adapter.process(input_file_path=file_path)
119+
full_text.append(
120+
{
121+
"section": "full",
122+
"text_contents": self._cleanup_text(extracted_text),
123+
}
124+
)
196125

197126
doc_id = ToolIndex.generate_file_id(
198127
tool_id=tool_id,

src/unstract/sdk/tool/validator.py

Lines changed: 4 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -3,11 +3,12 @@
33
from pathlib import Path
44
from typing import Any
55

6-
import magic
76
from jsonschema import Draft202012Validator, ValidationError, validators
7+
88
from unstract.sdk.constants import MetadataKey, PropKey
99
from unstract.sdk.tool.base import BaseTool
1010
from unstract.sdk.tool.mime_types import EXT_MIME_MAP
11+
from unstract.sdk.utils import ToolUtils
1112

1213

1314
def extend_with_default(validator_class: Any) -> Any:
@@ -211,26 +212,10 @@ def _validate_file_type(self, input_file: Path) -> None:
211212
)
212213
allowed_mimes.append(EXT_MIME_MAP[ext])
213214

214-
input_file_mime = self._get_file_mime(input_file=input_file)
215+
input_file_mime = ToolUtils.get_file_mime_type(input_file=input_file)
216+
self.tool.stream_log(f"Input file MIME: {input_file_mime}")
215217
if input_file_mime not in allowed_mimes:
216218
self.tool.stream_error_and_exit(
217219
f"File type of {input_file_mime} is not supported by"
218220
" the tool, check its PROPERTIES for a list of supported types"
219221
)
220-
221-
def _get_file_mime(self, input_file: Path) -> str:
222-
"""Gets the file MIME type for an input file. Uses libmagic to perform
223-
the same.
224-
225-
Args:
226-
input_file (Path): Path object of the input file
227-
228-
Returns:
229-
str: MIME type of the file
230-
"""
231-
input_file_mime = ""
232-
with open(input_file, mode="rb") as input_file_obj:
233-
sample_contents = input_file_obj.read(100)
234-
input_file_mime = magic.from_buffer(sample_contents, mime=True)
235-
self.tool.stream_log(f"Input file MIME: {input_file_mime}")
236-
return input_file_mime

src/unstract/sdk/utils/tool_utils.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,10 @@
11
import json
22
from hashlib import md5, sha256
3+
from pathlib import Path
34
from typing import Any
45

6+
import magic
7+
58
from unstract.sdk.constants import FileReaderSettings
69

710

@@ -75,3 +78,21 @@ def json_to_str(json_to_dump: dict[str, Any]) -> str:
7578
"""
7679
compact_json = json.dumps(json_to_dump, separators=(",", ":"))
7780
return compact_json
81+
82+
@staticmethod
def get_file_mime_type(input_file: Path) -> str:
    """Return the MIME type of a file, detected from its leading bytes.

    The first 100 bytes of the file are read in binary mode and handed to
    libmagic (through ``magic.from_buffer``) for identification.

    This is a static method, so it takes no ``self``: declaring one made
    keyword calls such as
    ``ToolUtils.get_file_mime_type(input_file=path)`` fail with a
    ``TypeError`` because nothing was bound to the extra parameter.

    Args:
        input_file (Path): Path object of the input file

    Returns:
        str: MIME type of the file

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # NOTE(review): only 100 bytes are sampled; python-magic's README
    # suggests a larger sample (about 2048 bytes) for reliable detection.
    # Kept at 100 to preserve existing results -- confirm before changing.
    with open(input_file, mode="rb") as input_file_obj:
        sample_contents = input_file_obj.read(100)
    # The handle is closed on leaving the ``with`` block, so there is no
    # need to rewind it before returning.
    return magic.from_buffer(sample_contents, mime=True)

0 commit comments

Comments
 (0)