Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions packages/markitdown/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ dependencies = [
[project.optional-dependencies]
all = [
"python-pptx",
"mammoth~=1.10.0",
"mammoth~=1.11.0",
"pandas",
"openpyxl",
"xlrd",
Expand All @@ -50,7 +50,7 @@ all = [
"azure-identity"
]
pptx = ["python-pptx"]
docx = ["mammoth", "lxml"]
docx = ["mammoth~=1.11.0", "lxml"]
xlsx = ["pandas", "openpyxl"]
xls = ["pandas", "xlrd"]
pdf = ["pdfminer.six"]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,13 +15,6 @@
_dependency_exc_info = None
try:
import mammoth
Copy link

Copilot AI Jan 1, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Removing the monkey-patch that disabled r:link processing appears to be intentional for addressing CVE-2025-11849, presumably because mammoth 1.11.0 handles this securely. However, since version 1.11.0 doesn't exist yet, this change may introduce a security vulnerability by re-enabling r:link processing with mammoth 1.10.0. The code changes should be synchronized with the actual availability of the secure mammoth version.

Copilot uses AI. Check for mistakes.
import mammoth.docx.files

def mammoth_files_open(self, uri):
warn("DOCX: processing of r:link resources (e.g., linked images) is disabled.")
return io.BytesIO(b"")

mammoth.docx.files.Files.open = mammoth_files_open

except ImportError:
# Preserve the error and stack trace for later
Expand Down
Binary file added packages/markitdown/tests/test_files/rlink.docx
Binary file not shown.
48 changes: 45 additions & 3 deletions packages/markitdown/tests/test_module_misc.py
Original file line number Diff line number Diff line change
Expand Up @@ -288,6 +288,47 @@ def test_input_as_strings() -> None:
assert "# Test" in result.text_content


def test_doc_rlink() -> None:
# Test for: CVE-2025-11849
markitdown = MarkItDown()

# Document with rlink
docx_file = os.path.join(TEST_FILES_DIR, "rlink.docx")

# Directory containing the target rlink file
rlink_tmp_dir = os.path.abspath(os.sep + "tmp")

# Ensure the tmp directory exists
if not os.path.exists(rlink_tmp_dir):
pytest.skip(f"Skipping rlink test; {rlink_tmp_dir} directory does not exist.")
return

Comment on lines +304 to +305
Copy link

Copilot AI Jan 1, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The early return statement after pytest.skip is unnecessary. pytest.skip raises an exception that prevents further execution, so the return on line 304 will never be reached.

Suggested change
return

Copilot uses AI. Check for mistakes.
rlink_file_path = os.path.join(rlink_tmp_dir, "test_rlink.txt")
rlink_content = "de658225-569e-4e3d-9ed2-cfb6abf927fc"
b64_prefix = (
"ZGU2NTgyMjUtNTY5ZS00ZTNkLTllZDItY2ZiNmFiZjk" # base64 prefix of rlink_content
Copy link

Copilot AI Jan 1, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The comment "# base64 prefix of rlink_content" is misleading. The value "ZGU2NTgyMjUtNTY5ZS00ZTNkLTllZDItY2ZiNmFiZjk" is truncated and not a complete base64 encoding. The full base64 encoding of "de658225-569e-4e3d-9ed2-cfb6abf927fc" would be "ZGU2NTgyMjUtNTY5ZS00ZTNkLTllZDItY2ZiNmFiZjkyN2Zj". Consider either using the complete base64 string or clarifying in the comment that this is a partial/prefix match.

Suggested change
"ZGU2NTgyMjUtNTY5ZS00ZTNkLTllZDItY2ZiNmFiZjk" # base64 prefix of rlink_content
"ZGU2NTgyMjUtNTY5ZS00ZTNkLTllZDItY2ZiNmFiZjkyN2Zj" # base64 encoding of rlink_content

Copilot uses AI. Check for mistakes.
)

if os.path.exists(rlink_file_path):
with open(rlink_file_path, "r", encoding="utf-8") as f:
existing_content = f.read()
if existing_content != rlink_content:
raise ValueError(
f"Existing {rlink_file_path} content does not match expected content."
)
else:
with open(rlink_file_path, "w", encoding="utf-8") as f:
f.write(rlink_content)

try:
Comment on lines +312 to +323
Copy link

Copilot AI Jan 1, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

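The test skips when /tmp doesn't exist, but when it does run it creates the file (line 321) before entering the try block. The finally cleanup only covers code inside the try, so an exception raised between file creation and the try statement would leave the file behind; failures inside the try (including the assertion) are already cleaned up. Consider moving the file creation inside the try block, or using a proper temporary directory with automatic cleanup. Note that if the existence check is also moved inside the try, a content mismatch would cause the finally block to delete a pre-existing file.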

Suggested change
if os.path.exists(rlink_file_path):
with open(rlink_file_path, "r", encoding="utf-8") as f:
existing_content = f.read()
if existing_content != rlink_content:
raise ValueError(
f"Existing {rlink_file_path} content does not match expected content."
)
else:
with open(rlink_file_path, "w", encoding="utf-8") as f:
f.write(rlink_content)
try:
try:
if os.path.exists(rlink_file_path):
with open(rlink_file_path, "r", encoding="utf-8") as f:
existing_content = f.read()
if existing_content != rlink_content:
raise ValueError(
f"Existing {rlink_file_path} content does not match expected content."
)
else:
with open(rlink_file_path, "w", encoding="utf-8") as f:
f.write(rlink_content)

Copilot uses AI. Check for mistakes.
result = markitdown.convert(docx_file, keep_data_uris=True).text_content
assert (
b64_prefix not in result
) # Make sure the target file was NOT embedded in the output
finally:
os.remove(rlink_file_path)
Comment on lines +328 to +329
Copy link

Copilot AI Jan 1, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The test unconditionally deletes the rlink file in the finally block, even if the file existed before the test ran. This could delete user data if the file was already present. Consider only deleting the file if the test created it (when os.path.exists returned False initially).

Copilot uses AI. Check for mistakes.


@pytest.mark.skipif(
skip_remote,
reason="do not run tests that query external urls",
Expand All @@ -301,9 +342,9 @@ def test_markitdown_remote() -> None:
assert test_string in result.text_content

# Youtube
result = markitdown.convert(YOUTUBE_TEST_URL)
for test_string in YOUTUBE_TEST_STRINGS:
assert test_string in result.text_content
# result = markitdown.convert(YOUTUBE_TEST_URL)
# for test_string in YOUTUBE_TEST_STRINGS:
# assert test_string in result.text_content


Comment on lines +345 to 349
Copy link

Copilot AI Jan 1, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

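These lines are commented-out code: the YouTube conversion check has been disabled rather than removed. Either delete the lines or keep the check behind an explicit skip with a comment explaining why it is disabled.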

Suggested change
# result = markitdown.convert(YOUTUBE_TEST_URL)
# for test_string in YOUTUBE_TEST_STRINGS:
# assert test_string in result.text_content

Copilot uses AI. Check for mistakes.
@pytest.mark.skipif(
Expand Down Expand Up @@ -452,6 +493,7 @@ def test_markitdown_llm() -> None:
test_markitdown_remote,
test_speech_transcription,
test_exceptions,
test_doc_rlink,
test_markitdown_exiftool,
test_markitdown_llm_parameters,
test_markitdown_llm,
Expand Down
Loading