Added priority argument to all converter constructors. (#324)

afourney · web-flow · commit 935da9976c9b · 2025-02-11T12:36:32.000-08:00
* Added priority argument to all converter constructors.
diff --git a/packages/markitdown/src/markitdown/_markitdown.py b/packages/markitdown/src/markitdown/_markitdown.py
@@ -47,10 +47,6 @@
 # Override mimetype for csv to fix issue on windows
 mimetypes.add_type("text/csv", ".csv")
 
-PRIORITY_SPECIFIC_FILE_FORMAT = 0.0
-PRIORITY_GENERIC_FILE_FORMAT = 10.0
-
-
 _plugins: Union[None | List[Any]] = None
 
 
@@ -123,6 +119,8 @@ def enable_builtins(self, **kwargs) -> None:
             self._llm_model = kwargs.get("llm_model")
             self._exiftool_path = kwargs.get("exiftool_path")
             self._style_map = kwargs.get("style_map")
+            if self._exiftool_path is None:
+                self._exiftool_path = os.getenv("EXIFTOOL_PATH")
 
             # Register converters for successful browsing operations
             # Later registrations are tried first / take higher priority than earlier registrations
@@ -349,11 +347,10 @@ def _convert(
                 _kwargs["_parent_converters"] = self._page_converters
 
                 # If we hit an error log it and keep trying
-                # try:
-                if True:
+                try:
                     res = converter.convert(local_path, **_kwargs)
-                # except Exception:
-                #    error_trace = ("\n\n" + traceback.format_exc()).strip()
+                except Exception:
+                    error_trace = ("\n\n" + traceback.format_exc()).strip()
 
                 if res is not None:
                     # Normalize the content
diff --git a/packages/markitdown/src/markitdown/converters/_base.py b/packages/markitdown/src/markitdown/converters/_base.py
@@ -12,7 +12,36 @@ def __init__(self, title: Union[str, None] = None, text_content: str = ""):
 class DocumentConverter:
     """Abstract superclass of all DocumentConverters."""
 
-    def __init__(self, priority: float = 0.0):
+    # Lower priority values are tried first.
+    PRIORITY_SPECIFIC_FILE_FORMAT = (
+        0.0  # e.g., .docx, .pdf, .xlsx, or specific pages, e.g., Wikipedia
+    )
+    PRIORITY_GENERIC_FILE_FORMAT = (
+        10.0  # Near catch-all converters for mimetypes like text/*, etc.
+    )
+
+    def __init__(self, priority: float = PRIORITY_SPECIFIC_FILE_FORMAT):
+        """
+        Initialize the DocumentConverter with a given priority.
+
+        Priorities work as follows: By default, most converters get priority
+        DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT (== 0). The exception
+        is the PlainTextConverter, which gets priority PRIORITY_GENERIC_FILE_FORMAT (== 10),
+        with lower values being tried first (i.e., higher priority).
+
+        Just prior to conversion, the converters are sorted by priority, using
+        a stable sort. This means that converters with the same priority will
+        remain in the same order, with the most recently registered converters
+        appearing first.
+
+        We have tight control over the order of built-in converters, but
+        plugins can register converters in any order. A converter's priority
+        field reasserts some control over the order of converters.
+
+        Plugins can register converters with any priority, to appear before or
+        after the built-ins. For example, a plugin with priority 9 will run
+        before the PlainTextConverter, but after the built-in converters.
+        """
         self._priority = priority
 
     def convert(
diff --git a/packages/markitdown/src/markitdown/converters/_bing_serp_converter.py b/packages/markitdown/src/markitdown/converters/_bing_serp_converter.py
@@ -16,6 +16,11 @@ class BingSerpConverter(DocumentConverter):
     NOTE: It is better to use the Bing API
     """
 
+    def __init__(
+        self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
+    ):
+        super().__init__(priority=priority)
+
     def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
         # Bail if not a Bing SERP
         extension = kwargs.get("file_extension", "")
diff --git a/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py b/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py
@@ -22,9 +22,13 @@ class DocumentIntelligenceConverter(DocumentConverter):
 
     def __init__(
         self,
+        *,
+        priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT,
         endpoint: str,
         api_version: str = "2024-07-31-preview",
     ):
+        super().__init__(priority=priority)
+
         self.endpoint = endpoint
         self.api_version = api_version
         self.doc_intel_client = DocumentIntelligenceClient(
diff --git a/packages/markitdown/src/markitdown/converters/_docx_converter.py b/packages/markitdown/src/markitdown/converters/_docx_converter.py
@@ -6,6 +6,7 @@
     DocumentConverterResult,
 )
 
+from ._base import DocumentConverter
 from ._html_converter import HtmlConverter
 
 
@@ -14,6 +15,11 @@ class DocxConverter(HtmlConverter):
     Converts DOCX files to Markdown. Style information (e.g.m headings) and tables are preserved where possible.
     """
 
+    def __init__(
+        self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
+    ):
+        super().__init__(priority=priority)
+
     def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
         # Bail if not a DOCX
         extension = kwargs.get("file_extension", "")
diff --git a/packages/markitdown/src/markitdown/converters/_html_converter.py b/packages/markitdown/src/markitdown/converters/_html_converter.py
@@ -8,6 +8,11 @@
 class HtmlConverter(DocumentConverter):
     """Anything with content type text/html"""
 
+    def __init__(
+        self, priority: float = DocumentConverter.PRIORITY_GENERIC_FILE_FORMAT
+    ):
+        super().__init__(priority=priority)
+
     def convert(
         self, local_path: str, **kwargs: Any
     ) -> Union[None, DocumentConverterResult]:
diff --git a/packages/markitdown/src/markitdown/converters/_image_converter.py b/packages/markitdown/src/markitdown/converters/_image_converter.py
@@ -1,5 +1,5 @@
 from typing import Union
-from ._base import DocumentConverterResult
+from ._base import DocumentConverter, DocumentConverterResult
 from ._media_converter import MediaConverter
 
 
@@ -8,6 +8,11 @@ class ImageConverter(MediaConverter):
     Converts images to markdown via extraction of metadata (if `exiftool` is installed), OCR (if `easyocr` is installed), and description via a multimodal LLM (if an llm_client is configured).
     """
 
+    def __init__(
+        self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
+    ):
+        super().__init__(priority=priority)
+
     def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
         # Bail if not an image
         extension = kwargs.get("file_extension", "")
diff --git a/packages/markitdown/src/markitdown/converters/_ipynb_converter.py b/packages/markitdown/src/markitdown/converters/_ipynb_converter.py
@@ -12,6 +12,11 @@
 class IpynbConverter(DocumentConverter):
     """Converts Jupyter Notebook (.ipynb) files to Markdown."""
 
+    def __init__(
+        self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
+    ):
+        super().__init__(priority=priority)
+
     def convert(
         self, local_path: str, **kwargs: Any
     ) -> Union[None, DocumentConverterResult]:
diff --git a/packages/markitdown/src/markitdown/converters/_media_converter.py b/packages/markitdown/src/markitdown/converters/_media_converter.py
@@ -11,6 +11,11 @@ class MediaConverter(DocumentConverter):
     Abstract class for multi-modal media (e.g., images and audio)
     """
 
+    def __init__(
+        self, priority: float = DocumentConverter.PRIORITY_GENERIC_FILE_FORMAT
+    ):
+        super().__init__(priority=priority)
+
     def _get_metadata(self, local_path, exiftool_path=None):
         if not exiftool_path:
             which_exiftool = shutil.which("exiftool")
@@ -27,10 +32,10 @@ def _get_metadata(self, local_path, exiftool_path=None):
 
             return None
         else:
-            try:
+            if True:
                 result = subprocess.run(
                     [exiftool_path, "-json", local_path], capture_output=True, text=True
                 ).stdout
                 return json.loads(result)[0]
-            except Exception:
-                return None
+            # except Exception:
+            #    return None
diff --git a/packages/markitdown/src/markitdown/converters/_mp3_converter.py b/packages/markitdown/src/markitdown/converters/_mp3_converter.py
@@ -1,6 +1,6 @@
 import tempfile
 from typing import Union
-from ._base import DocumentConverterResult
+from ._base import DocumentConverter, DocumentConverterResult
 from ._wav_converter import WavConverter
 from warnings import resetwarnings, catch_warnings
 
@@ -28,6 +28,11 @@ class Mp3Converter(WavConverter):
     Converts MP3 files to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` AND `pydub` are installed).
     """
 
+    def __init__(
+        self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
+    ):
+        super().__init__(priority=priority)
+
     def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
         # Bail if not a MP3
         extension = kwargs.get("file_extension", "")
diff --git a/packages/markitdown/src/markitdown/converters/_outlook_msg_converter.py b/packages/markitdown/src/markitdown/converters/_outlook_msg_converter.py
@@ -11,6 +11,11 @@ class OutlookMsgConverter(DocumentConverter):
     - Email body content
     """
 
+    def __init__(
+        self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
+    ):
+        super().__init__(priority=priority)
+
     def convert(
         self, local_path: str, **kwargs: Any
     ) -> Union[None, DocumentConverterResult]:
diff --git a/packages/markitdown/src/markitdown/converters/_pdf_converter.py b/packages/markitdown/src/markitdown/converters/_pdf_converter.py
@@ -9,6 +9,11 @@ class PdfConverter(DocumentConverter):
     Converts PDFs to Markdown. Most style information is ignored, so the results are essentially plain-text.
     """
 
+    def __init__(
+        self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
+    ):
+        super().__init__(priority=priority)
+
     def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
         # Bail if not a PDF
         extension = kwargs.get("file_extension", "")
diff --git a/packages/markitdown/src/markitdown/converters/_plain_text_converter.py b/packages/markitdown/src/markitdown/converters/_plain_text_converter.py
@@ -9,6 +9,11 @@
 class PlainTextConverter(DocumentConverter):
     """Anything with content type text/plain"""
 
+    def __init__(
+        self, priority: float = DocumentConverter.PRIORITY_GENERIC_FILE_FORMAT
+    ):
+        super().__init__(priority=priority)
+
     def convert(
         self, local_path: str, **kwargs: Any
     ) -> Union[None, DocumentConverterResult]:
diff --git a/packages/markitdown/src/markitdown/converters/_pptx_converter.py b/packages/markitdown/src/markitdown/converters/_pptx_converter.py
@@ -14,6 +14,11 @@ class PptxConverter(HtmlConverter):
     Converts PPTX files to Markdown. Supports heading, tables and images with alt text.
     """
 
+    def __init__(
+        self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
+    ):
+        super().__init__(priority=priority)
+
     def _get_llm_description(
         self, llm_client, llm_model, image_blob, content_type, prompt=None
     ):
diff --git a/packages/markitdown/src/markitdown/converters/_rss_converter.py b/packages/markitdown/src/markitdown/converters/_rss_converter.py
@@ -9,6 +9,11 @@
 class RssConverter(DocumentConverter):
     """Convert RSS / Atom type to markdown"""
 
+    def __init__(
+        self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
+    ):
+        super().__init__(priority=priority)
+
     def convert(
         self, local_path: str, **kwargs
     ) -> Union[None, DocumentConverterResult]:
diff --git a/packages/markitdown/src/markitdown/converters/_wav_converter.py b/packages/markitdown/src/markitdown/converters/_wav_converter.py
@@ -1,5 +1,5 @@
 from typing import Union
-from ._base import DocumentConverterResult
+from ._base import DocumentConverter, DocumentConverterResult
 from ._media_converter import MediaConverter
 
 # Optional Transcription support
@@ -17,6 +17,11 @@ class WavConverter(MediaConverter):
     Converts WAV files to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` is installed).
     """
 
+    def __init__(
+        self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
+    ):
+        super().__init__(priority=priority)
+
     def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
         # Bail if not a WAV
         extension = kwargs.get("file_extension", "")
diff --git a/packages/markitdown/src/markitdown/converters/_wikipedia_converter.py b/packages/markitdown/src/markitdown/converters/_wikipedia_converter.py
@@ -10,6 +10,11 @@
 class WikipediaConverter(DocumentConverter):
     """Handle Wikipedia pages separately, focusing only on the main document content."""
 
+    def __init__(
+        self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
+    ):
+        super().__init__(priority=priority)
+
     def convert(
         self, local_path: str, **kwargs: Any
     ) -> Union[None, DocumentConverterResult]:
diff --git a/packages/markitdown/src/markitdown/converters/_xlsx_converter.py b/packages/markitdown/src/markitdown/converters/_xlsx_converter.py
@@ -2,7 +2,7 @@
 
 import pandas as pd
 
-from ._base import DocumentConverterResult
+from ._base import DocumentConverter, DocumentConverterResult
 from ._html_converter import HtmlConverter
 
 
@@ -11,6 +11,11 @@ class XlsxConverter(HtmlConverter):
     Converts XLSX files to Markdown, with each sheet presented as a separate Markdown table.
     """
 
+    def __init__(
+        self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
+    ):
+        super().__init__(priority=priority)
+
     def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
         # Bail if not a XLSX
         extension = kwargs.get("file_extension", "")
diff --git a/packages/markitdown/src/markitdown/converters/_youtube_converter.py b/packages/markitdown/src/markitdown/converters/_youtube_converter.py
@@ -19,6 +19,11 @@
 class YouTubeConverter(DocumentConverter):
     """Handle YouTube specially, focusing on the video title, description, and transcript."""
 
+    def __init__(
+        self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
+    ):
+        super().__init__(priority=priority)
+
     def convert(
         self, local_path: str, **kwargs: Any
     ) -> Union[None, DocumentConverterResult]:
diff --git a/packages/markitdown/src/markitdown/converters/_zip_converter.py b/packages/markitdown/src/markitdown/converters/_zip_converter.py
@@ -45,6 +45,11 @@ class ZipConverter(DocumentConverter):
     - Cleans up temporary files after processing
     """
 
+    def __init__(
+        self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
+    ):
+        super().__init__(priority=priority)
+
     def convert(
         self, local_path: str, **kwargs: Any
     ) -> Union[None, DocumentConverterResult]:
diff --git a/packages/markitdown/tests/test_markitdown.py b/packages/markitdown/tests/test_markitdown.py
@@ -327,8 +327,8 @@ def test_markitdown_llm() -> None:
 
 if __name__ == "__main__":
     """Runs this file's tests from the command line."""
-    # test_markitdown_remote()
-    # test_markitdown_local()
+    test_markitdown_remote()
+    test_markitdown_local()
     test_markitdown_exiftool()
-    # test_markitdown_deprecation()
     # test_markitdown_llm()
+    print("All tests passed!")