Skip to content

Commit 935da99

Browse files
authored
Added priority argument to all converter constructors. (#324)
* Added priority argument to all converter constructors.
1 parent 5ce85c2 commit 935da99

21 files changed

+135
-19
lines changed

packages/markitdown/src/markitdown/_markitdown.py

Lines changed: 5 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -47,10 +47,6 @@
4747
# Override mimetype for csv to fix issue on windows
4848
mimetypes.add_type("text/csv", ".csv")
4949

50-
PRIORITY_SPECIFIC_FILE_FORMAT = 0.0
51-
PRIORITY_GENERIC_FILE_FORMAT = 10.0
52-
53-
5450
_plugins: Union[None | List[Any]] = None
5551

5652

@@ -123,6 +119,8 @@ def enable_builtins(self, **kwargs) -> None:
123119
self._llm_model = kwargs.get("llm_model")
124120
self._exiftool_path = kwargs.get("exiftool_path")
125121
self._style_map = kwargs.get("style_map")
122+
if self._exiftool_path is None:
123+
self._exiftool_path = os.getenv("EXIFTOOL_PATH")
126124

127125
# Register converters for successful browsing operations
128126
# Later registrations are tried first / take higher priority than earlier registrations
@@ -349,11 +347,10 @@ def _convert(
349347
_kwargs["_parent_converters"] = self._page_converters
350348

351349
# If we hit an error log it and keep trying
352-
# try:
353-
if True:
350+
try:
354351
res = converter.convert(local_path, **_kwargs)
355-
# except Exception:
356-
# error_trace = ("\n\n" + traceback.format_exc()).strip()
352+
except Exception:
353+
error_trace = ("\n\n" + traceback.format_exc()).strip()
357354

358355
if res is not None:
359356
# Normalize the content

packages/markitdown/src/markitdown/converters/_base.py

Lines changed: 30 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,36 @@ def __init__(self, title: Union[str, None] = None, text_content: str = ""):
1212
class DocumentConverter:
1313
"""Abstract superclass of all DocumentConverters."""
1414

15-
def __init__(self, priority: float = 0.0):
15+
# Lower priority values are tried first.
16+
PRIORITY_SPECIFIC_FILE_FORMAT = (
17+
0.0 # e.g., .docx, .pdf, .xlsx, or specific pages, e.g., Wikipedia
18+
)
19+
PRIORITY_GENERIC_FILE_FORMAT = (
20+
10.0 # Near catch-all converters for mimetypes like text/*, etc.
21+
)
22+
23+
def __init__(self, priority: float = PRIORITY_SPECIFIC_FILE_FORMAT):
24+
"""
25+
Initialize the DocumentConverter with a given priority.
26+
27+
Priorities work as follows: By default, most converters get priority
28+
DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT (== 0). The exception
29+
is the PlainTextConverter, which gets priority PRIORITY_GENERIC_FILE_FORMAT (== 10),
30+
with lower values being tried first (i.e., higher priority).
31+
32+
Just prior to conversion, the converters are sorted by priority, using
33+
a stable sort. This means that converters with the same priority will
34+
remain in the same order, with the most recently registered converters
35+
appearing first.
36+
37+
We have tight control over the order of built-in converters, but
38+
plugins can register converters in any order. A converter's priority
39+
field reasserts some control over the order of converters.
40+
41+
Plugins can register converters with any priority, to appear before or
42+
after the built-ins. For example, a plugin with priority 9 will run
43+
before the PlainTextConverter, but after the built-in converters.
44+
"""
1645
self._priority = priority
1746

1847
def convert(

packages/markitdown/src/markitdown/converters/_bing_serp_converter.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,11 @@ class BingSerpConverter(DocumentConverter):
1616
NOTE: It is better to use the Bing API
1717
"""
1818

19+
def __init__(
20+
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
21+
):
22+
super().__init__(priority=priority)
23+
1924
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
2025
# Bail if not a Bing SERP
2126
extension = kwargs.get("file_extension", "")

packages/markitdown/src/markitdown/converters/_doc_intel_converter.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,9 +22,13 @@ class DocumentIntelligenceConverter(DocumentConverter):
2222

2323
def __init__(
2424
self,
25+
*,
26+
priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT,
2527
endpoint: str,
2628
api_version: str = "2024-07-31-preview",
2729
):
30+
super().__init__(priority=priority)
31+
2832
self.endpoint = endpoint
2933
self.api_version = api_version
3034
self.doc_intel_client = DocumentIntelligenceClient(

packages/markitdown/src/markitdown/converters/_docx_converter.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
DocumentConverterResult,
77
)
88

9+
from ._base import DocumentConverter
910
from ._html_converter import HtmlConverter
1011

1112

@@ -14,6 +15,11 @@ class DocxConverter(HtmlConverter):
1415
Converts DOCX files to Markdown. Style information (e.g., headings) and tables are preserved where possible.
1516
"""
1617

18+
def __init__(
19+
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
20+
):
21+
super().__init__(priority=priority)
22+
1723
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
1824
# Bail if not a DOCX
1925
extension = kwargs.get("file_extension", "")

packages/markitdown/src/markitdown/converters/_html_converter.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,11 @@
88
class HtmlConverter(DocumentConverter):
99
"""Anything with content type text/html"""
1010

11+
def __init__(
12+
self, priority: float = DocumentConverter.PRIORITY_GENERIC_FILE_FORMAT
13+
):
14+
super().__init__(priority=priority)
15+
1116
def convert(
1217
self, local_path: str, **kwargs: Any
1318
) -> Union[None, DocumentConverterResult]:

packages/markitdown/src/markitdown/converters/_image_converter.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
from typing import Union
2-
from ._base import DocumentConverterResult
2+
from ._base import DocumentConverter, DocumentConverterResult
33
from ._media_converter import MediaConverter
44

55

@@ -8,6 +8,11 @@ class ImageConverter(MediaConverter):
88
Converts images to markdown via extraction of metadata (if `exiftool` is installed), OCR (if `easyocr` is installed), and description via a multimodal LLM (if an llm_client is configured).
99
"""
1010

11+
def __init__(
12+
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
13+
):
14+
super().__init__(priority=priority)
15+
1116
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
1217
# Bail if not an image
1318
extension = kwargs.get("file_extension", "")

packages/markitdown/src/markitdown/converters/_ipynb_converter.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,11 @@
1212
class IpynbConverter(DocumentConverter):
1313
"""Converts Jupyter Notebook (.ipynb) files to Markdown."""
1414

15+
def __init__(
16+
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
17+
):
18+
super().__init__(priority=priority)
19+
1520
def convert(
1621
self, local_path: str, **kwargs: Any
1722
) -> Union[None, DocumentConverterResult]:

packages/markitdown/src/markitdown/converters/_media_converter.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,11 @@ class MediaConverter(DocumentConverter):
1111
Abstract class for multi-modal media (e.g., images and audio)
1212
"""
1313

14+
def __init__(
15+
self, priority: float = DocumentConverter.PRIORITY_GENERIC_FILE_FORMAT
16+
):
17+
super().__init__(priority=priority)
18+
1419
def _get_metadata(self, local_path, exiftool_path=None):
1520
if not exiftool_path:
1621
which_exiftool = shutil.which("exiftool")
@@ -27,10 +32,10 @@ def _get_metadata(self, local_path, exiftool_path=None):
2732

2833
return None
2934
else:
30-
try:
35+
if True:
3136
result = subprocess.run(
3237
[exiftool_path, "-json", local_path], capture_output=True, text=True
3338
).stdout
3439
return json.loads(result)[0]
35-
except Exception:
36-
return None
40+
# except Exception:
41+
# return None

packages/markitdown/src/markitdown/converters/_mp3_converter.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
import tempfile
22
from typing import Union
3-
from ._base import DocumentConverterResult
3+
from ._base import DocumentConverter, DocumentConverterResult
44
from ._wav_converter import WavConverter
55
from warnings import resetwarnings, catch_warnings
66

@@ -28,6 +28,11 @@ class Mp3Converter(WavConverter):
2828
Converts MP3 files to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` AND `pydub` are installed).
2929
"""
3030

31+
def __init__(
32+
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
33+
):
34+
super().__init__(priority=priority)
35+
3136
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
3237
# Bail if not an MP3
3338
extension = kwargs.get("file_extension", "")

0 commit comments

Comments
 (0)