22# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
33#
44import logging
5+ import mimetypes
56import os
67import traceback
78from datetime import datetime
1314import nltk
1415import requests
1516from unstructured .file_utils .filetype import (
16- EXT_TO_FILETYPE ,
17- FILETYPE_TO_MIMETYPE ,
18- STR_TO_FILETYPE ,
1917 FileType ,
2018 detect_filetype ,
2119)
@@ -84,14 +82,34 @@ def _import_unstructured() -> None:
8482 global unstructured_partition_pdf
8583 global unstructured_partition_docx
8684 global unstructured_partition_pptx
87- from unstructured .partition .docx import partition_docx
88- from unstructured .partition .pdf import partition_pdf
89- from unstructured .partition .pptx import partition_pptx
9085
91- # separate global variables to properly propagate typing
92- unstructured_partition_pdf = partition_pdf
93- unstructured_partition_docx = partition_docx
94- unstructured_partition_pptx = partition_pptx
86+ # Import docx and pptx partitioners
87+ try :
88+ from unstructured .partition .docx import partition_docx
89+ from unstructured .partition .pptx import partition_pptx
90+
91+ # Set docx and pptx partitioners
92+ unstructured_partition_docx = partition_docx
93+ unstructured_partition_pptx = partition_pptx
94+ except ImportError :
95+ # If docx or pptx partitioners are not available, set them to None
96+ unstructured_partition_docx = None
97+ unstructured_partition_pptx = None
98+
99+ # Try to import PDF partitioner, but handle the case when unstructured_inference is not available
100+ try :
101+ from unstructured .partition .pdf import partition_pdf
102+
103+ # separate global variables to properly propagate typing
104+ unstructured_partition_pdf = partition_pdf
105+ except ImportError as e :
106+ if "unstructured_inference" in str (e ):
107+ # If unstructured_inference is not available, set PDF partitioner to None
108+ # This will be handled in _read_file_locally
109+ unstructured_partition_pdf = None
110+ else :
111+ # Re-raise if it's a different import error
112+ raise
95113
96114
97115def user_error (e : Exception ) -> bool :
@@ -207,13 +225,6 @@ def _read_file(
207225 logger : logging .Logger ,
208226 ) -> str :
209227 _import_unstructured ()
210- if (
211- (not unstructured_partition_pdf )
212- or (not unstructured_partition_docx )
213- or (not unstructured_partition_pptx )
214- ):
215- # check whether unstructured library is actually available for better error message and to ensure proper typing (can't be None after this point)
216- raise Exception ("unstructured library is not available" )
217228
218229 filetype : FileType | None = self ._get_filetype (file_handle , remote_file )
219230
@@ -227,6 +238,26 @@ def _read_file(
227238 decoded_content : str = optional_decode (file_content )
228239 return decoded_content
229240 if format .processing .mode == "local" :
241+ # Check if required partitioners are available
242+ if filetype == FileType .PDF and not unstructured_partition_pdf :
243+ raise self ._create_parse_error (
244+ remote_file ,
245+ "PDF parsing requires the unstructured_inference package. "
246+ "Please install it with `pip install unstructured_inference` or use API mode instead." ,
247+ )
248+ elif filetype == FileType .DOCX and not unstructured_partition_docx :
249+ raise self ._create_parse_error (
250+ remote_file ,
251+ "DOCX parsing requires the unstructured package with docx extras. "
252+ "Please install it with `pip install 'unstructured[docx]'` or use API mode instead." ,
253+ )
254+ elif filetype == FileType .PPTX and not unstructured_partition_pptx :
255+ raise self ._create_parse_error (
256+ remote_file ,
257+ "PPTX parsing requires the unstructured package with pptx extras. "
258+ "Please install it with `pip install 'unstructured[pptx]'` or use API mode instead." ,
259+ )
260+
230261 return self ._read_file_locally (
231262 file_handle ,
232263 filetype ,
@@ -335,7 +366,11 @@ def _read_file_remotely(
335366
336367 data = self ._params_to_dict (format .parameters , strategy )
337368
338- file_data = {"files" : ("filename" , file_handle , FILETYPE_TO_MIMETYPE [filetype ])}
369+ # Use Python's mimetypes module to get the MIME type for the file type
370+ mime_type = (
371+ mimetypes .guess_type (f"test.{ filetype .name .lower ()} " )[0 ] or "application/octet-stream"
372+ )
373+ file_data = {"files" : ("filename" , file_handle , mime_type )}
339374
340375 response = requests .post (
341376 f"{ format .api_url } /general/v0/general" , headers = headers , data = data , files = file_data
@@ -355,15 +390,6 @@ def _read_file_remotely(
355390 def _read_file_locally (
356391 self , file_handle : IOBase , filetype : FileType , strategy : str , remote_file : RemoteFile
357392 ) -> str :
358- _import_unstructured ()
359- if (
360- (not unstructured_partition_pdf )
361- or (not unstructured_partition_docx )
362- or (not unstructured_partition_pptx )
363- ):
364- # check whether unstructured library is actually available for better error message and to ensure proper typing (can't be None after this point)
365- raise Exception ("unstructured library is not available" )
366-
367393 file : Any = file_handle
368394
369395 # before the parsing logic is entered, the file is read completely to make sure it is in local memory
@@ -373,15 +399,37 @@ def _read_file_locally(
373399
374400 try :
375401 if filetype == FileType .PDF :
402+ if not unstructured_partition_pdf :
403+ raise self ._create_parse_error (
404+ remote_file ,
405+ "PDF parsing requires the unstructured_inference package. "
406+ "Please install it with `pip install unstructured_inference` or use API mode instead." ,
407+ )
376408 # for PDF, read the file into a BytesIO object because some code paths in pdf parsing are doing an instance check on the file object and don't work with file-like objects
377409 file_handle .seek (0 )
378410 with BytesIO (file_handle .read ()) as file :
379411 file_handle .seek (0 )
380412 elements = unstructured_partition_pdf (file = file , strategy = strategy )
381413 elif filetype == FileType .DOCX :
414+ if not unstructured_partition_docx :
415+ raise self ._create_parse_error (
416+ remote_file ,
417+ "DOCX parsing requires the unstructured package with docx extras. "
418+ "Please install it with `pip install 'unstructured[docx]'` or use API mode instead." ,
419+ )
382420 elements = unstructured_partition_docx (file = file )
383421 elif filetype == FileType .PPTX :
422+ if not unstructured_partition_pptx :
423+ raise self ._create_parse_error (
424+ remote_file ,
425+ "PPTX parsing requires the unstructured package with pptx extras. "
426+ "Please install it with `pip install 'unstructured[pptx]'` or use API mode instead." ,
427+ )
384428 elements = unstructured_partition_pptx (file = file )
429+ else :
430+ raise self ._create_parse_error (
431+ remote_file , f"Unsupported file type for local processing: { filetype } "
432+ )
385433 except Exception as e :
386434 raise self ._create_parse_error (remote_file , str (e ))
387435
@@ -405,8 +453,11 @@ def _get_filetype(self, file: IOBase, remote_file: RemoteFile) -> Optional[FileT
405453 2. Use the file name if available
406454 3. Use the file content
407455 """
408- if remote_file .mime_type and remote_file .mime_type in STR_TO_FILETYPE :
409- return STR_TO_FILETYPE [remote_file .mime_type ]
456+ # In the new version of unstructured, we need to use detect_filetype with content_type
457+ if remote_file .mime_type :
458+ detected_type = detect_filetype (content_type = remote_file .mime_type )
459+ if detected_type != FileType .UNK :
460+ return detected_type
410461
411462 # set name to none, otherwise unstructured will try to get the modified date from the local file system
412463 if hasattr (file , "name" ):
@@ -418,7 +469,7 @@ def _get_filetype(self, file: IOBase, remote_file: RemoteFile) -> Optional[FileT
418469 file_type : FileType | None = None
419470 try :
420471 file_type = detect_filetype (
421- filename = remote_file .uri ,
472+ file_path = remote_file .uri ,
422473 )
423474 except Exception :
424475 # Path doesn't exist locally. Try something else...
@@ -427,15 +478,26 @@ def _get_filetype(self, file: IOBase, remote_file: RemoteFile) -> Optional[FileT
427478 if file_type and file_type != FileType .UNK :
428479 return file_type
429480
430- type_based_on_content = detect_filetype (file = file )
481+ # Convert IOBase to BytesIO for compatibility with detect_filetype
482+ file .seek (0 )
483+ file_content = file .read ()
484+ file .seek (0 )
485+ from io import BytesIO
486+
487+ file_bytes = BytesIO (file_content )
488+ type_based_on_content = detect_filetype (file = file_bytes )
431489 file .seek (0 ) # detect_filetype is reading to read the file content, so we need to reset
432490
433491 if type_based_on_content and type_based_on_content != FileType .UNK :
434492 return type_based_on_content
435493
436- extension = "." + remote_file .uri .split ("." )[- 1 ].lower ()
437- if extension in EXT_TO_FILETYPE :
438- return EXT_TO_FILETYPE [extension ]
494+ # Try to detect file type from extension
495+ extension = remote_file .uri .split ("." )[- 1 ].lower () if "." in remote_file .uri else ""
496+ if extension :
497+ # In the new version, we use detect_filetype with file_path
498+ detected_type = detect_filetype (file_path = f"test.{ extension } " )
499+ if detected_type != FileType .UNK :
500+ return detected_type
439501
440502 return None
441503
0 commit comments