Skip to content

Commit 42d831a

Browse files
feat: update langchain, openai, and unstructured to latest versions
Co-Authored-By: [email protected] <[email protected]>
1 parent 651f07f commit 42d831a

File tree

8 files changed

+770
-732
lines changed

8 files changed

+770
-732
lines changed

airbyte_cdk/destinations/vector_db_based/document_processor.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,9 +8,9 @@
88
from typing import Any, Dict, List, Mapping, Optional, Tuple
99

1010
import dpath
11-
from langchain.text_splitter import Language, RecursiveCharacterTextSplitter
12-
from langchain.utils import stringify_dict
1311
from langchain_core.documents.base import Document
12+
from langchain_core.utils import stringify_dict
13+
from langchain_text_splitters import Language, RecursiveCharacterTextSplitter
1414

1515
from airbyte_cdk.destinations.vector_db_based.config import (
1616
ProcessingConfigModel,

airbyte_cdk/destinations/vector_db_based/embedder.py

Lines changed: 25 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -7,10 +7,10 @@
77
from dataclasses import dataclass
88
from typing import List, Optional, Union, cast
99

10-
from langchain.embeddings.cohere import CohereEmbeddings
11-
from langchain.embeddings.fake import FakeEmbeddings
12-
from langchain.embeddings.localai import LocalAIEmbeddings
13-
from langchain.embeddings.openai import OpenAIEmbeddings
10+
from langchain_community.embeddings.cohere import CohereEmbeddings
11+
from langchain_community.embeddings.fake import FakeEmbeddings
12+
from langchain_community.embeddings.localai import LocalAIEmbeddings
13+
from langchain_openai.embeddings import OpenAIEmbeddings
1414

1515
from airbyte_cdk.destinations.vector_db_based.config import (
1616
AzureOpenAIEmbeddingConfigModel,
@@ -106,30 +106,36 @@ def embedding_dimensions(self) -> int:
106106

107107
class OpenAIEmbedder(BaseOpenAIEmbedder):
108108
def __init__(self, config: OpenAIEmbeddingConfigModel, chunk_size: int):
109+
from pydantic import SecretStr
110+
109111
super().__init__(
110-
OpenAIEmbeddings( # type: ignore [call-arg]
111-
openai_api_key=config.openai_key, max_retries=15, disallowed_special=()
112+
OpenAIEmbeddings(
113+
api_key=SecretStr(config.openai_key), max_retries=15, disallowed_special=()
112114
),
113115
chunk_size,
114-
) # type: ignore
116+
)
115117

116118

117119
class AzureOpenAIEmbedder(BaseOpenAIEmbedder):
118120
def __init__(self, config: AzureOpenAIEmbeddingConfigModel, chunk_size: int):
119121
# Azure OpenAI API has — as of 20230927 — a limit of 16 documents per request
122+
from pydantic import SecretStr
123+
120124
super().__init__(
121-
OpenAIEmbeddings( # type: ignore [call-arg]
122-
openai_api_key=config.openai_key,
125+
OpenAIEmbeddings(
126+
api_key=SecretStr(config.openai_key),
123127
chunk_size=16,
124128
max_retries=15,
125-
openai_api_type="azure",
126-
openai_api_version="2023-05-15",
127-
openai_api_base=config.api_base,
128-
deployment=config.deployment,
129+
model_kwargs={
130+
"api_type": "azure",
131+
"api_base": config.api_base,
132+
"api_version": "2023-05-15",
133+
"deployment": config.deployment,
134+
},
129135
disallowed_special=(),
130136
),
131137
chunk_size,
132-
) # type: ignore
138+
)
133139

134140

135141
COHERE_VECTOR_SIZE = 1024
@@ -197,11 +203,13 @@ def __init__(self, config: OpenAICompatibleEmbeddingConfigModel):
197203
# Always set an API key even if there is none defined in the config because the validator will fail otherwise. Embedding APIs that don't require an API key don't fail if one is provided, so this is not breaking usage.
198204
self.embeddings = LocalAIEmbeddings(
199205
model=config.model_name,
200-
openai_api_key=config.api_key or "dummy-api-key",
201-
openai_api_base=config.base_url,
206+
model_kwargs={
207+
"api_key": config.api_key or "dummy-api-key",
208+
"base_url": config.base_url,
209+
},
202210
max_retries=15,
203211
disallowed_special=(),
204-
) # type: ignore
212+
)
205213

206214
def check(self) -> Optional[str]:
207215
deployment_mode = os.environ.get("DEPLOYMENT_MODE", "")

airbyte_cdk/sources/file_based/file_types/unstructured_parser.py

Lines changed: 96 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
33
#
44
import logging
5+
import mimetypes
56
import os
67
import traceback
78
from datetime import datetime
@@ -13,9 +14,6 @@
1314
import nltk
1415
import requests
1516
from unstructured.file_utils.filetype import (
16-
EXT_TO_FILETYPE,
17-
FILETYPE_TO_MIMETYPE,
18-
STR_TO_FILETYPE,
1917
FileType,
2018
detect_filetype,
2119
)
@@ -84,14 +82,34 @@ def _import_unstructured() -> None:
8482
global unstructured_partition_pdf
8583
global unstructured_partition_docx
8684
global unstructured_partition_pptx
87-
from unstructured.partition.docx import partition_docx
88-
from unstructured.partition.pdf import partition_pdf
89-
from unstructured.partition.pptx import partition_pptx
9085

91-
# separate global variables to properly propagate typing
92-
unstructured_partition_pdf = partition_pdf
93-
unstructured_partition_docx = partition_docx
94-
unstructured_partition_pptx = partition_pptx
86+
# Import docx and pptx partitioners
87+
try:
88+
from unstructured.partition.docx import partition_docx
89+
from unstructured.partition.pptx import partition_pptx
90+
91+
# Set docx and pptx partitioners
92+
unstructured_partition_docx = partition_docx
93+
unstructured_partition_pptx = partition_pptx
94+
except ImportError:
95+
# If docx or pptx partitioners are not available, set them to None
96+
unstructured_partition_docx = None
97+
unstructured_partition_pptx = None
98+
99+
# Try to import PDF partitioner, but handle the case when unstructured_inference is not available
100+
try:
101+
from unstructured.partition.pdf import partition_pdf
102+
103+
# separate global variables to properly propagate typing
104+
unstructured_partition_pdf = partition_pdf
105+
except ImportError as e:
106+
if "unstructured_inference" in str(e):
107+
# If unstructured_inference is not available, set PDF partitioner to None
108+
# This will be handled in _read_file_locally
109+
unstructured_partition_pdf = None
110+
else:
111+
# Re-raise if it's a different import error
112+
raise
95113

96114

97115
def user_error(e: Exception) -> bool:
@@ -207,13 +225,6 @@ def _read_file(
207225
logger: logging.Logger,
208226
) -> str:
209227
_import_unstructured()
210-
if (
211-
(not unstructured_partition_pdf)
212-
or (not unstructured_partition_docx)
213-
or (not unstructured_partition_pptx)
214-
):
215-
# check whether unstructured library is actually available for better error message and to ensure proper typing (can't be None after this point)
216-
raise Exception("unstructured library is not available")
217228

218229
filetype: FileType | None = self._get_filetype(file_handle, remote_file)
219230

@@ -227,6 +238,26 @@ def _read_file(
227238
decoded_content: str = optional_decode(file_content)
228239
return decoded_content
229240
if format.processing.mode == "local":
241+
# Check if required partitioners are available
242+
if filetype == FileType.PDF and not unstructured_partition_pdf:
243+
raise self._create_parse_error(
244+
remote_file,
245+
"PDF parsing requires the unstructured_inference package. "
246+
"Please install it with `pip install unstructured_inference` or use API mode instead.",
247+
)
248+
elif filetype == FileType.DOCX and not unstructured_partition_docx:
249+
raise self._create_parse_error(
250+
remote_file,
251+
"DOCX parsing requires the unstructured package with docx extras. "
252+
"Please install it with `pip install 'unstructured[docx]'` or use API mode instead.",
253+
)
254+
elif filetype == FileType.PPTX and not unstructured_partition_pptx:
255+
raise self._create_parse_error(
256+
remote_file,
257+
"PPTX parsing requires the unstructured package with pptx extras. "
258+
"Please install it with `pip install 'unstructured[pptx]'` or use API mode instead.",
259+
)
260+
230261
return self._read_file_locally(
231262
file_handle,
232263
filetype,
@@ -335,7 +366,11 @@ def _read_file_remotely(
335366

336367
data = self._params_to_dict(format.parameters, strategy)
337368

338-
file_data = {"files": ("filename", file_handle, FILETYPE_TO_MIMETYPE[filetype])}
369+
# Use Python's mimetypes module to get the MIME type for the file type
370+
mime_type = (
371+
mimetypes.guess_type(f"test.{filetype.name.lower()}")[0] or "application/octet-stream"
372+
)
373+
file_data = {"files": ("filename", file_handle, mime_type)}
339374

340375
response = requests.post(
341376
f"{format.api_url}/general/v0/general", headers=headers, data=data, files=file_data
@@ -355,15 +390,6 @@ def _read_file_remotely(
355390
def _read_file_locally(
356391
self, file_handle: IOBase, filetype: FileType, strategy: str, remote_file: RemoteFile
357392
) -> str:
358-
_import_unstructured()
359-
if (
360-
(not unstructured_partition_pdf)
361-
or (not unstructured_partition_docx)
362-
or (not unstructured_partition_pptx)
363-
):
364-
# check whether unstructured library is actually available for better error message and to ensure proper typing (can't be None after this point)
365-
raise Exception("unstructured library is not available")
366-
367393
file: Any = file_handle
368394

369395
# before the parsing logic is entered, the file is read completely to make sure it is in local memory
@@ -373,15 +399,37 @@ def _read_file_locally(
373399

374400
try:
375401
if filetype == FileType.PDF:
402+
if not unstructured_partition_pdf:
403+
raise self._create_parse_error(
404+
remote_file,
405+
"PDF parsing requires the unstructured_inference package. "
406+
"Please install it with `pip install unstructured_inference` or use API mode instead.",
407+
)
376408
# for PDF, read the file into a BytesIO object because some code paths in pdf parsing are doing an instance check on the file object and don't work with file-like objects
377409
file_handle.seek(0)
378410
with BytesIO(file_handle.read()) as file:
379411
file_handle.seek(0)
380412
elements = unstructured_partition_pdf(file=file, strategy=strategy)
381413
elif filetype == FileType.DOCX:
414+
if not unstructured_partition_docx:
415+
raise self._create_parse_error(
416+
remote_file,
417+
"DOCX parsing requires the unstructured package with docx extras. "
418+
"Please install it with `pip install 'unstructured[docx]'` or use API mode instead.",
419+
)
382420
elements = unstructured_partition_docx(file=file)
383421
elif filetype == FileType.PPTX:
422+
if not unstructured_partition_pptx:
423+
raise self._create_parse_error(
424+
remote_file,
425+
"PPTX parsing requires the unstructured package with pptx extras. "
426+
"Please install it with `pip install 'unstructured[pptx]'` or use API mode instead.",
427+
)
384428
elements = unstructured_partition_pptx(file=file)
429+
else:
430+
raise self._create_parse_error(
431+
remote_file, f"Unsupported file type for local processing: {filetype}"
432+
)
385433
except Exception as e:
386434
raise self._create_parse_error(remote_file, str(e))
387435

@@ -405,8 +453,11 @@ def _get_filetype(self, file: IOBase, remote_file: RemoteFile) -> Optional[FileT
405453
2. Use the file name if available
406454
3. Use the file content
407455
"""
408-
if remote_file.mime_type and remote_file.mime_type in STR_TO_FILETYPE:
409-
return STR_TO_FILETYPE[remote_file.mime_type]
456+
# In the new version of unstructured, we need to use detect_filetype with content_type
457+
if remote_file.mime_type:
458+
detected_type = detect_filetype(content_type=remote_file.mime_type)
459+
if detected_type != FileType.UNK:
460+
return detected_type
410461

411462
# set name to none, otherwise unstructured will try to get the modified date from the local file system
412463
if hasattr(file, "name"):
@@ -418,7 +469,7 @@ def _get_filetype(self, file: IOBase, remote_file: RemoteFile) -> Optional[FileT
418469
file_type: FileType | None = None
419470
try:
420471
file_type = detect_filetype(
421-
filename=remote_file.uri,
472+
file_path=remote_file.uri,
422473
)
423474
except Exception:
424475
# Path doesn't exist locally. Try something else...
@@ -427,15 +478,26 @@ def _get_filetype(self, file: IOBase, remote_file: RemoteFile) -> Optional[FileT
427478
if file_type and file_type != FileType.UNK:
428479
return file_type
429480

430-
type_based_on_content = detect_filetype(file=file)
481+
# Convert IOBase to BytesIO for compatibility with detect_filetype
482+
file.seek(0)
483+
file_content = file.read()
484+
file.seek(0)
485+
from io import BytesIO
486+
487+
file_bytes = BytesIO(file_content)
488+
type_based_on_content = detect_filetype(file=file_bytes)
431489
file.seek(0) # detect_filetype is reading to read the file content, so we need to reset
432490

433491
if type_based_on_content and type_based_on_content != FileType.UNK:
434492
return type_based_on_content
435493

436-
extension = "." + remote_file.uri.split(".")[-1].lower()
437-
if extension in EXT_TO_FILETYPE:
438-
return EXT_TO_FILETYPE[extension]
494+
# Try to detect file type from extension
495+
extension = remote_file.uri.split(".")[-1].lower() if "." in remote_file.uri else ""
496+
if extension:
497+
# In the new version, we use detect_filetype with file_path
498+
detected_type = detect_filetype(file_path=f"test.{extension}")
499+
if detected_type != FileType.UNK:
500+
return detected_type
439501

440502
return None
441503

conftest.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
"""
2+
This module contains pytest fixtures for the airbyte-python-cdk tests.
3+
"""
4+
5+
pytest_plugins = ["unit_tests.sources.file_based.file_types.test_unstructured_parser_patch"]

0 commit comments

Comments
 (0)